# Note: If you need permission to use some of the linked resources, please email: sjw6236@psu.edu
# install.packages("dplyr")
# install.packages('car')
# install.packages("ggplot2")
#load package here
library(tidyverse)
library(gridBase)
library(grid)
library(dplyr)
library(car)
library (GGally)
library(readxl)
library(ggplot2)
library(broom)
library(ggpubr)
library(corrplot)
library(MASS)
library(Stat2Data)
library(olsrr)
library(leaps)
library(leaps)
library(caret)
library(lmtest)
library(nortest)
library(Metrics)
library(broom.mixed)
library(ggbreak)
library(gridExtra)
library(tinytex)
library(randomForest)
library(ranger)
options(scipen=999) # turn-off scientific notation like 1e+48
theme_set(theme_bw()) # pre-set the bw theme.
## Read raw data: weather features for the five Spanish cities
weather_raw <- read.csv(file = "Data/weather_features.csv")
head(weather_raw, 4)
# List every column present in the raw weather data
weather_raw_fields <- colnames(weather_raw)
print(weather_raw_fields)
[1] "dt_iso" "city_name" "temp" "temp_min" "temp_max" "pressure"
[7] "humidity" "wind_speed" "wind_deg" "rain_1h" "rain_3h" "snow_3h"
[13] "clouds_all" "weather_id" "weather_main" "weather_description" "weather_icon"
# Rows whose (dt_iso, city_name) pair already appeared earlier are duplicates;
# columns 1:2 of weather_raw are dt_iso and city_name.
weather_duplicate <- weather_raw[duplicated(weather_raw[, 1:2]), ]
# Keep only the first occurrence of each (dt_iso, city_name) pair
weather <- weather_raw[!duplicated(weather_raw[, 1:2]), ]
# TODO(Keith): document in bullets how the new Excel sheet (energy_dataset-KO.xlsx) was generated. Use $$...$$ for any math, and reference the original data sources and other links as done at the top of this script.
energy_raw<-read_excel("Data/energy_dataset-KO.xlsx")
# (console output: read_excel reported "New names:" — duplicated column names in the workbook were auto-repaired)
# Build one merged weather+energy data set per city and persist it as CSV.
# NOTE(review): two quirks inherited from the source data are preserved on
# purpose — confirm against the raw files before "fixing" them:
#   * the weather data stores Barcelona's city_name as " Barcelona"
#     (leading space);
#   * the energy workbook spells Bilbao's demand column "Bilboa".
# Map: city_name value in the weather data -> demand column in energy_raw.
city_energy_cols <- c(
  "Valencia"   = "Valencia",
  " Barcelona" = "Barcelona",
  "Bilbao"     = "Bilboa",
  "Seville"    = "Seville",
  "Madrid"     = "Madrid"
)

# Merge one city's weather rows with its hourly energy demand.
#   city_name:  value to match against weather$city_name
#   energy_col: column of energy_raw holding that city's demand
# Returns a data.frame with the demand column renamed to "energy".
merge_city_energy <- function(city_name, energy_col) {
  city_weather <- weather[weather$city_name == city_name, ]
  city_data <- merge(city_weather, energy_raw[, c("time", energy_col)],
                     by.x = "dt_iso", by.y = "time")
  colnames(city_data)[colnames(city_data) == energy_col] <- "energy"
  city_data
}

Data_Valencia  <- merge_city_energy("Valencia",   city_energy_cols[["Valencia"]])
Data_Barcelona <- merge_city_energy(" Barcelona", city_energy_cols[[" Barcelona"]])
Data_Bilbao    <- merge_city_energy("Bilbao",     city_energy_cols[["Bilbao"]])
Data_Seville   <- merge_city_energy("Seville",    city_energy_cols[["Seville"]])
Data_Madrid    <- merge_city_energy("Madrid",     city_energy_cols[["Madrid"]])

# Persist one CSV per city; row.names = TRUE writes the row index as column "X"
# which the per-city sections below rename to DataID.
write.csv(Data_Barcelona, "Data/Barcelona.csv", row.names = TRUE)
write.csv(Data_Valencia,  "Data/Valencia.csv",  row.names = TRUE)
write.csv(Data_Bilbao,    "Data/Bilbao.csv",    row.names = TRUE)
write.csv(Data_Seville,   "Data/Seville.csv",   row.names = TRUE)
write.csv(Data_Madrid,    "Data/Madrid.csv",    row.names = TRUE)
# Data with all predictors and response variable
## Read raw data: Barcelona
Barcelona_raw <- read.csv(file = "Data/Barcelona.csv")
# Rename columns: "X" is the CSV row index written earlier, "dt_iso" the timestamp
colnames(Barcelona_raw)[colnames(Barcelona_raw) == "X"] <- "DataID"
colnames(Barcelona_raw)[colnames(Barcelona_raw) == "dt_iso"] <- "time"
# Keep only the predictors and the response of interest
Barcelona_Data <- Barcelona_raw %>%
  dplyr::select(DataID, time, temp, humidity, pressure, wind_speed,
                rain_1h, rain_3h, snow_3h, weather_main, energy)
print(unique(Barcelona_Data$weather_main))
[1] "clear" "clouds" "rain" "snow" "drizzle" "thunderstorm" "mist" "fog" "dust"
# Encode weather_main as a factor over a fixed 12-level vocabulary so every
# city uses the same integer coding (levels absent from Barcelona keep their
# index in the shared scheme).
weather_levels <- c("clear", "clouds", "drizzle", "dust", "fog", "haze",
                    "mist", "rain", "smoke", "snow", "squall", "thunderstorm")
Barcelona_Data <- transform(
  Barcelona_Data,
  weather_main = factor(weather_main, levels = weather_levels, labels = 1:12)
)
# Lookup table documenting the index -> description mapping
Weather_description <- data.frame(
  Weather_Description = weather_levels,
  Index = 1:12
)
print(Weather_description)
# Rain duration = sum of the 1h and 3h rain columns
Barcelona_Data["rain_duration"] <- Barcelona_Data$rain_1h + Barcelona_Data$rain_3h
# Attach the hand-labelled categorical columns from the energy workbook under
# short names. The long names must match the Excel headers byte-for-byte
# (including the embedded "\r\n").
categorical_cols <- c(
  "Specified Categorical Variable:\r\nTime Band (Peak, Off-Peak, Mid-Peak)" = "time_band",
  "Specified Categorical Variable:\r\nSeason (Spring, Summer, Autumn, Winter)" = "season",
  "Specified Categorical Variable:\r\nTime of day (Day vs Night)" = "day_night"
)
for (long_name in names(categorical_cols)) {
  Barcelona_Data <- merge(Barcelona_Data, energy_raw[, c("time", long_name)],
                          by.x = "time", by.y = "time")
  colnames(Barcelona_Data)[colnames(Barcelona_Data) == long_name] <-
    categorical_cols[[long_name]]
}
# Rename remaining columns for clarity, drop the raw rain columns and DataID,
# and fix the column order.
colnames(Barcelona_Data)[colnames(Barcelona_Data) == "snow_3h"] <- "snow_duration"
Barcelona_Data <- subset(Barcelona_Data, select = -c(rain_1h, rain_3h, DataID))
colnames(Barcelona_Data)[colnames(Barcelona_Data) == "time"] <- "time_ID"
Barcelona_Data <- Barcelona_Data[, c("time_ID", "temp", "humidity", "pressure",
                                     "wind_speed", "rain_duration",
                                     "snow_duration", "day_night", "time_band",
                                     "season", "weather_main", "energy")]
colnames(Barcelona_Data)[colnames(Barcelona_Data) == "energy"] <- "energy_demand"
# Reference: typical average weather ranges for Barcelona motivate the outlier thresholds applied below
# Inspect suspicious rows before filtering: temperatures below 270 K
Barcelona_Data[Barcelona_Data$temp < 270, ]
# Drop physically implausible records and near-zero demand readings
Barcelona_Data <- subset(Barcelona_Data, temp > 270)                       # keep temp above 270 K
Barcelona_Data <- subset(Barcelona_Data, pressure > 900 & pressure < 1050) # keep pressure in (900, 1050) mbar
Barcelona_Data <- subset(Barcelona_Data, energy_demand > 10)               # drop (near-)zero demand rows (threshold 10)
# Outlier-inspection scatter plots: each predictor vs. observation index.
# Idiom fix: seq_along(x) replaces c(1:length(x)) — identical output here,
# but safe for empty vectors and the conventional R form.
Barcelona.rain_duration <- ggplot(Barcelona_Data) +
  geom_point(aes(seq_along(rain_duration), rain_duration)) +
  labs(subtitle = "Rain Duration for Barcelona",
       y = "rain duration",
       x = "# data point")
Barcelona.humidity <- ggplot(Barcelona_Data) +
  geom_point(aes(seq_along(humidity), humidity)) +
  labs(subtitle = "Humidity for Barcelona",
       y = "humidity",
       x = "# data point")
Barcelona.wind_speed <- ggplot(Barcelona_Data) +
  geom_point(aes(seq_along(wind_speed), wind_speed)) +
  labs(subtitle = "Wind Speed for Barcelona",
       y = "wind_speed",
       x = "# data point")
Barcelona.pressure <- ggplot(Barcelona_Data) +
  geom_point(aes(seq_along(pressure), pressure)) +
  labs(subtitle = "Pressure of Barcelona",
       y = "pressure",
       x = "# data point")
# 2 x 2 panel of the four diagnostics
ggarrange(Barcelona.rain_duration, Barcelona.humidity, Barcelona.wind_speed,
          Barcelona.pressure,
          labels = c("(A)", "(B)", "(C)", "(D)"),
          ncol = 2, nrow = 2)
# Baseline linear model: energy demand on the weather predictors plus the
# categorical time-band / season / day-night / weather factors. No temporal
# terms yet — the low adjusted R^2 printed below motivates adding them.
FullModel <- lm(energy_demand ~ temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = Barcelona_Data)
# Adjusted R^2 of the weather-only model
summary(FullModel)$adj.r.squared
[1] 0.3125999
Adjusted R2 of the model is very low (0.3125999, as printed above).
We perform Autocorrelation and Partial Autocorrelation to incorporate temporal variability/influence in the model.
# Opening the graphical device: write the ACF/PACF pair to acfpcf.pdf
# Customizing the output
pdf("acfpcf.pdf", # File name
width = 14, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
# Two panels side by side: ACF on the left, PACF on the right
par(mfrow = c(1, 2))
Barcelona.acr <- acf(Barcelona_Data$energy_demand,plot = FALSE)
# NOTE(review): res= is a png()/bitmap-device argument; plot() here just
# passes it through ... where it has no effect — confirm and consider removing
plot(Barcelona.acr, main = "Energy Demand of Barcelona", res=600, cex.main = 3)
grid(10,10)
Barcelona.pacr<- pacf(Barcelona_Data$energy_demand,plot =FALSE)
plot(Barcelona.pacr, main = "Energy Demand of Barcelona",res=600, cex.main = 3)
grid(10,10)
# Closing the graphical device
dev.off()
null device
1
# Collect the partial autocorrelations and keep the strong ones (|pacf| > 0.25).
pacf_values <- as.data.frame(Barcelona.pacr$acf)
# Filter on the single column V1 explicitly — clearer and safer than applying
# abs() to the whole data frame inside subset() as the original did.
pacf_values_detect <- subset(pacf_values, abs(V1) > 0.25)
head(pacf_values_detect)
# Check the highest partially autocorrelated lags, strongest first.
# Reuse idx instead of computing order() twice (the original discarded idx).
idx <- order(abs(pacf_values_detect$V1), decreasing = TRUE)
pacf_values_detect <- pacf_values_detect[idx, ]
# Build temporal demand columns E_1, E_2, E_25 suggested by the ACF/PACF.
# NOTE(review): energy_demand[-1] aligned after dropping the LAST data row
# gives row i the demand of row i+1 — i.e. the NEXT observation (a lead, not
# a lag) assuming rows are in ascending time order (merge() sorts by dt_iso).
# Confirm the intended direction; a true lag would drop the FIRST row instead.
E_1<-Barcelona_Data$energy_demand[-1] # demand shifted by one row
Barcelona_Data<-Barcelona_Data[-nrow(Barcelona_Data),]
Barcelona_Data$E_1 <- E_1
# Shift by one more row for E_2
E_2<-E_1[-1]
Barcelona_Data<-Barcelona_Data[-nrow(Barcelona_Data),]
Barcelona_Data$E_2 <- E_2
# E_25: demand 24 rows away (one day at hourly resolution); trim the last 24
E_25<-Barcelona_Data$energy_demand[25:nrow(Barcelona_Data)] # demand 24 rows ahead
Barcelona_Data<-Barcelona_Data[1:(nrow(Barcelona_Data)-24),]
Barcelona_Data$E_25 <- E_25
# Refit the full model with the temporal terms added (adjusted R^2 printed below)
FullModel1 <- lm(energy_demand ~ E_1 + E_2 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = Barcelona_Data)
summary(FullModel1)
Call:
lm(formula = energy_demand ~ E_1 + E_2 + E_25 + temp + humidity +
pressure + wind_speed + rain_duration + factor(day_night) +
factor(time_band) + factor(season) + factor(weather_main),
data = Barcelona_Data)
Residuals:
Min 1Q Median 3Q Max
-1609.24 -50.99 8.82 62.86 2529.96
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -443.913651 95.856207 -4.631 0.0000036514802 ***
E_1 1.602696 0.004053 395.472 < 0.0000000000000002 ***
E_2 -0.714152 0.003737 -191.080 < 0.0000000000000002 ***
E_25 0.030901 0.001588 19.454 < 0.0000000000000002 ***
temp 2.564276 0.162888 15.743 < 0.0000000000000002 ***
humidity -0.476128 0.041091 -11.587 < 0.0000000000000002 ***
pressure 0.008830 0.085399 0.103 0.917648
wind_speed 0.503034 0.320373 1.570 0.116389
rain_duration 0.088170 1.063993 0.083 0.933957
factor(day_night)2 6.985750 1.654274 4.223 0.0000241842532 ***
factor(time_band)2 26.676131 7.987014 3.340 0.000839 ***
factor(time_band)3 14.395365 6.215916 2.316 0.020570 *
factor(season)2 -15.109435 2.308903 -6.544 0.0000000000607 ***
factor(season)3 -2.076330 1.806323 -1.149 0.250367
factor(season)4 22.721483 1.974512 11.507 < 0.0000000000000002 ***
factor(weather_main)2 -0.735382 1.323482 -0.556 0.578459
factor(weather_main)3 -2.926329 8.477791 -0.345 0.729964
factor(weather_main)4 181.549670 79.215696 2.292 0.021921 *
factor(weather_main)5 0.717756 14.634756 0.049 0.960884
factor(weather_main)7 -15.212228 6.012154 -2.530 0.011403 *
factor(weather_main)8 6.053918 2.536987 2.386 0.017026 *
factor(weather_main)10 105.739970 30.017678 3.523 0.000428 ***
factor(weather_main)12 2.309079 7.752191 0.298 0.765811
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 112 on 34930 degrees of freedom
Multiple R-squared: 0.9591, Adjusted R-squared: 0.9591
F-statistic: 3.724e+04 on 22 and 34930 DF, p-value: < 0.00000000000000022
# Final column order for the modelling data set
Barcelona_Data <- Barcelona_Data[, c("time_ID", "E_1", "E_2", "E_25", "temp", "humidity", "pressure", "wind_speed", "rain_duration", "snow_duration", "day_night", "time_band", "season", "weather_main", "energy_demand")]
### Generate Working and Evaluation Data (random 80/20 split, fixed seed)
# NOTE(review): a uniform random split of time-series rows lets E_1/E_2/E_25
# carry information across the working/evaluation boundary — consider a
# chronological split if strict out-of-sample evaluation is required.
set.seed(501)
# floor() makes the 80% size an explicit integer; seq_len() is the safe
# idiom for 1:nrow(). Same rows sampled as the original code.
working_rows <- sample(seq_len(nrow(Barcelona_Data)), floor(0.80 * nrow(Barcelona_Data)))
working_Barcelona <- Barcelona_Data[working_rows, ]
# Restore chronological order via the original row names, kept as "index"
working_Barcelona$index <- as.numeric(row.names(working_Barcelona))
working_Barcelona <- working_Barcelona[order(working_Barcelona$index), ]
working_Barcelona <- working_Barcelona %>%
  relocate(index)
evaluation_Barcelona <- Barcelona_Data[-working_rows, ]
evaluation_Barcelona$index <- as.numeric(row.names(evaluation_Barcelona))
evaluation_Barcelona <- evaluation_Barcelona[order(evaluation_Barcelona$index), ]
evaluation_Barcelona <- evaluation_Barcelona %>%
  relocate(index)
write.csv(working_Barcelona, "Data/working_Barcelona.csv", row.names = FALSE)
write.csv(evaluation_Barcelona, "Data/evaluation_Barcelona.csv", row.names = FALSE)
# ---- Valencia --------------------------------------------------------------
# Same pipeline as Barcelona: read merged CSV, recode weather_main, build
# rain_duration, attach the hand-labelled categorical columns, filter
# outliers, add temporal demand columns, write an 80/20 split.
# NOTE(review): this block is duplicated per city — a shared function would
# remove the copy-paste risk.
Valencia_raw<-read.csv(file = "Data/Valencia.csv")
# "X" is the CSV row index written by write.csv(row.names = TRUE)
colnames(Valencia_raw)[colnames(Valencia_raw) == "X"] <- "DataID"
colnames(Valencia_raw)[colnames(Valencia_raw) == "dt_iso"] <- "time"
Valencia_Data<-Valencia_raw %>%
dplyr::select(DataID,time,temp,humidity,pressure,wind_speed,rain_1h,rain_3h,snow_3h,weather_main,energy)
# Fixed 12-level vocabulary keeps factor codes comparable across cities
Valencia_Data<-transform(Valencia_Data, weather_main = factor(weather_main,
levels = c("clear", "clouds", "drizzle","dust","fog","haze","mist","rain","smoke","snow","squall","thunderstorm"),
labels = c(1:12)))
# Add two columns of rain duration
Valencia_Data["rain_duration"] <- Valencia_Data$rain_1h + Valencia_Data$rain_3h
# Attach time_band / season / day_night from the energy workbook; the long
# names must match the Excel headers exactly (including the "\r\n")
Valencia_Data <-merge(Valencia_Data, energy_raw[,c("time","Specified Categorical Variable:\r\nTime Band (Peak, Off-Peak, Mid-Peak)")], by.x = "time", by.y = "time")
colnames(Valencia_Data )[colnames(Valencia_Data ) == "Specified Categorical Variable:\r\nTime Band (Peak, Off-Peak, Mid-Peak)"] <- "time_band"
Valencia_Data <-merge(Valencia_Data, energy_raw[,c("time","Specified Categorical Variable:\r\nSeason (Spring, Summer, Autumn, Winter)")], by.x = "time", by.y = "time")
colnames(Valencia_Data )[colnames(Valencia_Data ) == "Specified Categorical Variable:\r\nSeason (Spring, Summer, Autumn, Winter)"] <- "season"
Valencia_Data <-merge(Valencia_Data, energy_raw[,c("time","Specified Categorical Variable:\r\nTime of day (Day vs Night)")], by.x = "time", by.y = "time")
colnames(Valencia_Data )[colnames(Valencia_Data ) == "Specified Categorical Variable:\r\nTime of day (Day vs Night)"] <- "day_night"
colnames(Valencia_Data )[colnames(Valencia_Data ) == "snow_3h"] <- "snow_duration"
Valencia_Data<-subset(Valencia_Data,select=-c(rain_1h,rain_3h,DataID))
colnames(Valencia_Data )[colnames(Valencia_Data ) == "time"] <- "time_ID"
Valencia_Data <- Valencia_Data [, c("time_ID","temp","humidity","pressure","wind_speed","rain_duration","snow_duration","day_night","time_band","season","weather_main","energy")]
colnames(Valencia_Data )[colnames(Valencia_Data ) == "energy"] <- "energy_demand"
# Rows flagged by the temperature filter, kept for reference
VD_<- Valencia_Data[Valencia_Data$temp < 270 ,]
Valencia_Data<- subset(Valencia_Data, temp>270) # remove temperature less than 270K
Valencia_Data<- subset(Valencia_Data, pressure >900 & pressure <1050) # remove pressure outside [900,1050]mbar
Valencia_Data<-subset(Valencia_Data, energy_demand >10) # remove (near-)zero demand rows (threshold 10)
# Outlier-inspection scatter plots (predictor vs. observation index)
Valencia.rain_duration <- ggplot(Valencia_Data) +
geom_point( aes(c(1:length(rain_duration)),rain_duration)) +
labs(subtitle="Rain Duration for Valencia",
y="rain duration",
x="# data point")
Valencia.humidity <- ggplot(Valencia_Data) +
geom_point( aes(c(1:length(humidity)),humidity)) +
labs(subtitle="Humidity for Valencia",
y="humidity",
x="# data point")
Valencia.wind_speed <- ggplot(Valencia_Data) +
geom_point( aes(c(1:length(wind_speed)),wind_speed)) +
labs(subtitle="Wind Speed for Valencia",
y="wind_speed",
x="# data point")
Valencia.pressure <- ggplot(Valencia_Data) +
geom_point( aes(c(1:length(pressure)),pressure)) +
labs(subtitle="Pressure of Valencia",
y="pressure",
x="# data point")
ggarrange(Valencia.rain_duration,Valencia.humidity,Valencia.wind_speed,Valencia.pressure,
labels = c("(A)", "(B)", "(C)","(D)"),
ncol = 2, nrow = 2)
# Temporal demand columns. NOTE(review): energy_demand[-1] aligned after
# dropping the LAST row gives each row the NEXT observation's demand (a lead,
# not a lag) if rows ascend in time — confirm the intended direction.
E_1<-Valencia_Data$energy_demand[-1] # demand shifted by one row
Valencia_Data<-Valencia_Data[-nrow(Valencia_Data),]
Valencia_Data$E_1 <- E_1
E_2<-E_1[-1]
Valencia_Data<-Valencia_Data[-nrow(Valencia_Data),]
Valencia_Data$E_2 <- E_2
E_25<-Valencia_Data$energy_demand[25:nrow(Valencia_Data)] # demand 24 rows ahead
Valencia_Data<-Valencia_Data[1:(nrow(Valencia_Data)-24),]
Valencia_Data$E_25 <- E_25
Valencia_Data <- Valencia_Data [, c("time_ID", "E_1","E_2","E_25", "temp","humidity","pressure","wind_speed","rain_duration","snow_duration","day_night","time_band","season","weather_main","energy_demand")]
### Generate Working and Evaluation Data
# Random 80/20 split, fixed seed. NOTE(review): a random split of time-series
# rows can leak E_1/E_2/E_25 information between the two sets.
set.seed(501)
working_rows <- sample(1:nrow(Valencia_Data),0.80*nrow(Valencia_Data))
working_Valencia <-Valencia_Data[working_rows,]
working_Valencia$index <- as.numeric(row.names(working_Valencia))
working_Valencia<- working_Valencia[order(working_Valencia$index), ]
working_Valencia<-working_Valencia %>%
relocate(index)
evaluation_Valencia <-Valencia_Data[-working_rows,]
evaluation_Valencia$index <- as.numeric(row.names(evaluation_Valencia))
evaluation_Valencia<- evaluation_Valencia[order(evaluation_Valencia$index), ]
evaluation_Valencia<-evaluation_Valencia %>%
relocate(index)
write.csv(working_Valencia,"Data/working_Valencia.csv", row.names = FALSE)
write.csv(evaluation_Valencia,"Data/evaluation_Valencia.csv", row.names = FALSE)
# ---- Bilbao ----------------------------------------------------------------
# Same pipeline as Barcelona/Valencia; see the notes there. Duplicated per
# city — a shared function would remove the copy-paste risk.
Bilbao_raw<-read.csv(file = "Data/Bilbao.csv")
# "X" is the CSV row index written by write.csv(row.names = TRUE)
colnames(Bilbao_raw)[colnames(Bilbao_raw) == "X"] <- "DataID"
colnames(Bilbao_raw)[colnames(Bilbao_raw) == "dt_iso"] <- "time"
Bilbao_Data<-Bilbao_raw %>%
dplyr::select(DataID,time,temp,humidity,pressure,wind_speed,rain_1h,rain_3h,snow_3h,weather_main,energy)
# Fixed 12-level vocabulary keeps factor codes comparable across cities
Bilbao_Data<-transform(Bilbao_Data, weather_main = factor(weather_main,
levels = c("clear", "clouds", "drizzle","dust","fog","haze","mist","rain","smoke","snow","squall","thunderstorm"),
labels = c(1:12)))
# Add two columns of rain duration
Bilbao_Data["rain_duration"] <- Bilbao_Data$rain_1h + Bilbao_Data$rain_3h
# Attach time_band / season / day_night from the energy workbook; the long
# names must match the Excel headers exactly (including the "\r\n")
Bilbao_Data <-merge(Bilbao_Data, energy_raw[,c("time","Specified Categorical Variable:\r\nTime Band (Peak, Off-Peak, Mid-Peak)")], by.x = "time", by.y = "time")
colnames(Bilbao_Data )[colnames(Bilbao_Data ) == "Specified Categorical Variable:\r\nTime Band (Peak, Off-Peak, Mid-Peak)"] <- "time_band"
Bilbao_Data <-merge(Bilbao_Data, energy_raw[,c("time","Specified Categorical Variable:\r\nSeason (Spring, Summer, Autumn, Winter)")], by.x = "time", by.y = "time")
colnames(Bilbao_Data )[colnames(Bilbao_Data ) == "Specified Categorical Variable:\r\nSeason (Spring, Summer, Autumn, Winter)"] <- "season"
Bilbao_Data <-merge(Bilbao_Data, energy_raw[,c("time","Specified Categorical Variable:\r\nTime of day (Day vs Night)")], by.x = "time", by.y = "time")
colnames(Bilbao_Data )[colnames(Bilbao_Data ) == "Specified Categorical Variable:\r\nTime of day (Day vs Night)"] <- "day_night"
colnames(Bilbao_Data )[colnames(Bilbao_Data ) == "snow_3h"] <- "snow_duration"
Bilbao_Data<-subset(Bilbao_Data,select=-c(rain_1h,rain_3h,DataID))
colnames(Bilbao_Data )[colnames(Bilbao_Data ) == "time"] <- "time_ID"
Bilbao_Data <- Bilbao_Data [, c("time_ID","temp","humidity","pressure","wind_speed","rain_duration","snow_duration","day_night","time_band","season","weather_main","energy")]
colnames(Bilbao_Data )[colnames(Bilbao_Data ) == "energy"] <- "energy_demand"
# Rows flagged by the temperature filter, kept for reference
BD_<- Bilbao_Data[Bilbao_Data$temp < 270 ,]
Bilbao_Data<- subset(Bilbao_Data, temp>270) # remove temperature less than 270K
Bilbao_Data<- subset(Bilbao_Data, pressure >900 & pressure <1050) # remove pressure outside [900,1050]mbar
Bilbao_Data<-subset(Bilbao_Data, energy_demand >10) # remove (near-)zero demand rows (threshold 10)
# Outlier-inspection scatter plots (predictor vs. observation index)
Bilbao.rain_duration <- ggplot(Bilbao_Data) +
geom_point( aes(c(1:length(rain_duration)),rain_duration)) +
labs(subtitle="Rain Duration for Bilbao",
y="rain duration",
x="# data point")
Bilbao.humidity <- ggplot(Bilbao_Data) +
geom_point( aes(c(1:length(humidity)),humidity)) +
labs(subtitle="Humidity for Bilbao",
y="humidity",
x="# data point")
Bilbao.wind_speed <- ggplot(Bilbao_Data) +
geom_point( aes(c(1:length(wind_speed)),wind_speed)) +
labs(subtitle="Wind Speed for Bilbao",
y="wind_speed",
x="# data point")
Bilbao.pressure <- ggplot(Bilbao_Data) +
geom_point( aes(c(1:length(pressure)),pressure)) +
labs(subtitle="Pressure of Bilbao",
y="pressure",
x="# data point")
ggarrange(Bilbao.rain_duration,Bilbao.humidity,Bilbao.wind_speed,Bilbao.pressure,
labels = c("(A)", "(B)", "(C)","(D)"),
ncol = 2, nrow = 2)
# Temporal demand columns. NOTE(review): energy_demand[-1] aligned after
# dropping the LAST row gives each row the NEXT observation's demand (a lead,
# not a lag) if rows ascend in time — confirm the intended direction.
E_1<-Bilbao_Data$energy_demand[-1] # demand shifted by one row
Bilbao_Data<-Bilbao_Data[-nrow(Bilbao_Data),]
Bilbao_Data$E_1 <- E_1
E_2<-E_1[-1]
Bilbao_Data<-Bilbao_Data[-nrow(Bilbao_Data),]
Bilbao_Data$E_2 <- E_2
E_25<-Bilbao_Data$energy_demand[25:nrow(Bilbao_Data)] # demand 24 rows ahead
Bilbao_Data<-Bilbao_Data[1:(nrow(Bilbao_Data)-24),]
Bilbao_Data$E_25 <- E_25
Bilbao_Data <- Bilbao_Data [, c("time_ID", "E_1","E_2","E_25", "temp","humidity","pressure","wind_speed","rain_duration","snow_duration","day_night","time_band","season","weather_main","energy_demand")]
### Generate Working and Evaluation Data
# Random 80/20 split, fixed seed. NOTE(review): a random split of time-series
# rows can leak E_1/E_2/E_25 information between the two sets.
set.seed(501)
working_rows <- sample(1:nrow(Bilbao_Data),0.80*nrow(Bilbao_Data))
working_Bilbao <-Bilbao_Data[working_rows,]
working_Bilbao$index <- as.numeric(row.names(working_Bilbao))
working_Bilbao<- working_Bilbao[order(working_Bilbao$index), ]
working_Bilbao<-working_Bilbao %>%
relocate(index)
evaluation_Bilbao <-Bilbao_Data[-working_rows,]
evaluation_Bilbao$index <- as.numeric(row.names(evaluation_Bilbao))
evaluation_Bilbao<- evaluation_Bilbao[order(evaluation_Bilbao$index), ]
evaluation_Bilbao<-evaluation_Bilbao %>%
relocate(index)
write.csv(working_Bilbao,"Data/working_Bilbao.csv", row.names = FALSE)
write.csv(evaluation_Bilbao,"Data/evaluation_Bilbao.csv", row.names = FALSE)
# ---- Seville ---------------------------------------------------------------
# Same pipeline as Barcelona/Valencia; see the notes there. Duplicated per
# city — a shared function would remove the copy-paste risk.
Seville_raw<-read.csv(file = "Data/Seville.csv")
# "X" is the CSV row index written by write.csv(row.names = TRUE)
colnames(Seville_raw)[colnames(Seville_raw) == "X"] <- "DataID"
colnames(Seville_raw)[colnames(Seville_raw) == "dt_iso"] <- "time"
Seville_Data<-Seville_raw %>%
dplyr::select(DataID,time,temp,humidity,pressure,wind_speed,rain_1h,rain_3h,snow_3h,weather_main,energy)
# Fixed 12-level vocabulary keeps factor codes comparable across cities
Seville_Data<-transform(Seville_Data, weather_main = factor(weather_main,
levels = c("clear", "clouds", "drizzle","dust","fog","haze","mist","rain","smoke","snow","squall","thunderstorm"),
labels = c(1:12)))
# Add two columns of rain duration
Seville_Data["rain_duration"] <- Seville_Data$rain_1h + Seville_Data$rain_3h
# Attach time_band / season / day_night from the energy workbook; the long
# names must match the Excel headers exactly (including the "\r\n")
Seville_Data <-merge(Seville_Data, energy_raw[,c("time","Specified Categorical Variable:\r\nTime Band (Peak, Off-Peak, Mid-Peak)")], by.x = "time", by.y = "time")
colnames(Seville_Data )[colnames(Seville_Data ) == "Specified Categorical Variable:\r\nTime Band (Peak, Off-Peak, Mid-Peak)"] <- "time_band"
Seville_Data <-merge(Seville_Data, energy_raw[,c("time","Specified Categorical Variable:\r\nSeason (Spring, Summer, Autumn, Winter)")], by.x = "time", by.y = "time")
colnames(Seville_Data )[colnames(Seville_Data ) == "Specified Categorical Variable:\r\nSeason (Spring, Summer, Autumn, Winter)"] <- "season"
Seville_Data <-merge(Seville_Data, energy_raw[,c("time","Specified Categorical Variable:\r\nTime of day (Day vs Night)")], by.x = "time", by.y = "time")
colnames(Seville_Data )[colnames(Seville_Data ) == "Specified Categorical Variable:\r\nTime of day (Day vs Night)"] <- "day_night"
colnames(Seville_Data )[colnames(Seville_Data ) == "snow_3h"] <- "snow_duration"
Seville_Data<-subset(Seville_Data,select=-c(rain_1h,rain_3h,DataID))
colnames(Seville_Data )[colnames(Seville_Data ) == "time"] <- "time_ID"
Seville_Data <- Seville_Data [, c("time_ID","temp","humidity","pressure","wind_speed","rain_duration","snow_duration","day_night","time_band","season","weather_main","energy")]
colnames(Seville_Data )[colnames(Seville_Data ) == "energy"] <- "energy_demand"
# Rows flagged by the temperature filter, kept for reference
SD_<-Seville_Data[Seville_Data$temp < 270 ,]
Seville_Data<- subset(Seville_Data, temp>270) # remove temperature less than 270K
Seville_Data<- subset(Seville_Data, pressure >900 & pressure <1050) # remove pressure outside [900,1050]mbar
Seville_Data<-subset(Seville_Data, energy_demand >10) # remove (near-)zero demand rows (threshold 10)
# Outlier-inspection scatter plots (predictor vs. observation index)
Seville.rain_duration <- ggplot(Seville_Data) +
geom_point( aes(c(1:length(rain_duration)),rain_duration)) +
labs(subtitle="Rain Duration for Seville",
y="rain duration",
x="# data point")
Seville.humidity <- ggplot(Seville_Data) +
geom_point( aes(c(1:length(humidity)),humidity)) +
labs(subtitle="Humidity for Seville",
y="humidity",
x="# data point")
Seville.wind_speed <- ggplot(Seville_Data) +
geom_point( aes(c(1:length(wind_speed)),wind_speed)) +
labs(subtitle="Wind Speed for Seville",
y="wind_speed",
x="# data point")
Seville.pressure <- ggplot(Seville_Data) +
geom_point( aes(c(1:length(pressure)),pressure)) +
labs(subtitle="Pressure of Seville",
y="pressure",
x="# data point")
ggarrange(Seville.rain_duration,Seville.humidity,Seville.wind_speed,Seville.pressure,
labels = c("(A)", "(B)", "(C)","(D)"),
ncol = 2, nrow = 2)
# Temporal demand columns. NOTE(review): energy_demand[-1] aligned after
# dropping the LAST row gives each row the NEXT observation's demand (a lead,
# not a lag) if rows ascend in time — confirm the intended direction.
E_1<-Seville_Data$energy_demand[-1] # demand shifted by one row
Seville_Data<-Seville_Data[-nrow(Seville_Data),]
Seville_Data$E_1 <- E_1
E_2<-E_1[-1]
Seville_Data<-Seville_Data[-nrow(Seville_Data),]
Seville_Data$E_2 <- E_2
E_25<-Seville_Data$energy_demand[25:nrow(Seville_Data)] # demand 24 rows ahead
Seville_Data<-Seville_Data[1:(nrow(Seville_Data)-24),]
Seville_Data$E_25 <- E_25
Seville_Data <- Seville_Data [, c("time_ID", "E_1","E_2","E_25", "temp","humidity","pressure","wind_speed","rain_duration","snow_duration","day_night","time_band","season","weather_main","energy_demand")]
### Generate Working and Evaluation Data
# Random 80/20 split, fixed seed. NOTE(review): a random split of time-series
# rows can leak E_1/E_2/E_25 information between the two sets.
set.seed(501)
working_rows <- sample(1:nrow(Seville_Data),0.80*nrow(Seville_Data))
working_Seville <-Seville_Data[working_rows,]
working_Seville$index <- as.numeric(row.names(working_Seville))
working_Seville<- working_Seville[order(working_Seville$index), ]
working_Seville<-working_Seville %>%
relocate(index)
evaluation_Seville <-Seville_Data[-working_rows,]
evaluation_Seville$index <- as.numeric(row.names(evaluation_Seville))
evaluation_Seville<- evaluation_Seville[order(evaluation_Seville$index), ]
evaluation_Seville<-evaluation_Seville %>%
relocate(index)
write.csv(working_Seville,"Data/working_Seville.csv", row.names = FALSE)
write.csv(evaluation_Seville,"Data/evaluation_Seville.csv", row.names = FALSE)
# ---- Madrid ----------------------------------------------------------------
# Same pipeline as Barcelona/Valencia; see the notes there. Duplicated per
# city — a shared function would remove the copy-paste risk.
Madrid_raw<-read.csv(file = "Data/Madrid.csv")
# "X" is the CSV row index written by write.csv(row.names = TRUE)
colnames(Madrid_raw)[colnames(Madrid_raw) == "X"] <- "DataID"
colnames(Madrid_raw)[colnames(Madrid_raw) == "dt_iso"] <- "time"
Madrid_Data<-Madrid_raw %>%
dplyr::select(DataID,time,temp,humidity,pressure,wind_speed,rain_1h,rain_3h,snow_3h,weather_main,energy)
# Fixed 12-level vocabulary keeps factor codes comparable across cities
Madrid_Data<-transform(Madrid_Data, weather_main = factor(weather_main,
levels = c("clear", "clouds", "drizzle","dust","fog","haze","mist","rain","smoke","snow","squall","thunderstorm"),
labels = c(1:12)))
# Add two columns of rain duration
Madrid_Data["rain_duration"] <- Madrid_Data$rain_1h + Madrid_Data$rain_3h
# Attach time_band / season / day_night from the energy workbook; the long
# names must match the Excel headers exactly (including the "\r\n")
Madrid_Data <-merge(Madrid_Data, energy_raw[,c("time","Specified Categorical Variable:\r\nTime Band (Peak, Off-Peak, Mid-Peak)")], by.x = "time", by.y = "time")
colnames(Madrid_Data )[colnames(Madrid_Data ) == "Specified Categorical Variable:\r\nTime Band (Peak, Off-Peak, Mid-Peak)"] <- "time_band"
Madrid_Data <-merge(Madrid_Data, energy_raw[,c("time","Specified Categorical Variable:\r\nSeason (Spring, Summer, Autumn, Winter)")], by.x = "time", by.y = "time")
colnames(Madrid_Data )[colnames(Madrid_Data ) == "Specified Categorical Variable:\r\nSeason (Spring, Summer, Autumn, Winter)"] <- "season"
Madrid_Data <-merge(Madrid_Data, energy_raw[,c("time","Specified Categorical Variable:\r\nTime of day (Day vs Night)")], by.x = "time", by.y = "time")
colnames(Madrid_Data )[colnames(Madrid_Data ) == "Specified Categorical Variable:\r\nTime of day (Day vs Night)"] <- "day_night"
colnames(Madrid_Data )[colnames(Madrid_Data ) == "snow_3h"] <- "snow_duration"
Madrid_Data<-subset(Madrid_Data,select=-c(rain_1h,rain_3h,DataID))
colnames(Madrid_Data )[colnames(Madrid_Data ) == "time"] <- "time_ID"
Madrid_Data <- Madrid_Data [, c("time_ID","temp","humidity","pressure","wind_speed","rain_duration","snow_duration","day_night","time_band","season","weather_main","energy")]
colnames(Madrid_Data )[colnames(Madrid_Data ) == "energy"] <- "energy_demand"
# Rows flagged by the temperature filter, kept for reference
MD_ <- Madrid_Data[Madrid_Data$temp < 270 ,]
Madrid_Data<- subset(Madrid_Data, temp>270) # remove temperature less than 270K
Madrid_Data<- subset(Madrid_Data, pressure >900 & pressure <1050) # remove pressure outside [900,1050]mbar
Madrid_Data<-subset(Madrid_Data, energy_demand >10) # remove (near-)zero demand rows (threshold 10)
# Outlier-inspection scatter plots (predictor vs. observation index)
Madrid.rain_duration <- ggplot(Madrid_Data) +
geom_point( aes(c(1:length(rain_duration)),rain_duration)) +
labs(subtitle="Rain Duration for Madrid",
y="rain duration",
x="# data point")
Madrid.humidity <- ggplot(Madrid_Data) +
geom_point( aes(c(1:length(humidity)),humidity)) +
labs(subtitle="Humidity for Madrid",
y="humidity",
x="# data point")
Madrid.wind_speed <- ggplot(Madrid_Data) +
geom_point( aes(c(1:length(wind_speed)),wind_speed)) +
labs(subtitle="Wind Speed for Madrid",
y="wind_speed",
x="# data point")
Madrid.pressure <- ggplot(Madrid_Data) +
geom_point( aes(c(1:length(pressure)),pressure)) +
labs(subtitle="Pressure of Madrid",
y="pressure",
x="# data point")
ggarrange(Madrid.rain_duration,Madrid.humidity,Madrid.wind_speed,Madrid.pressure,
labels = c("(A)", "(B)", "(C)","(D)"),
ncol = 2, nrow = 2)
# Temporal demand columns. NOTE(review): energy_demand[-1] aligned after
# dropping the LAST row gives each row the NEXT observation's demand (a lead,
# not a lag) if rows ascend in time — confirm the intended direction.
E_1<-Madrid_Data$energy_demand[-1] # demand shifted by one row
Madrid_Data<-Madrid_Data[-nrow(Madrid_Data),]
Madrid_Data$E_1 <- E_1
E_2<-E_1[-1]
Madrid_Data<-Madrid_Data[-nrow(Madrid_Data),]
Madrid_Data$E_2 <- E_2
E_25<-Madrid_Data$energy_demand[25:nrow(Madrid_Data)] # demand 24 rows ahead
Madrid_Data<-Madrid_Data[1:(nrow(Madrid_Data)-24),]
Madrid_Data$E_25 <- E_25
Madrid_Data <- Madrid_Data [, c("time_ID", "E_1","E_2","E_25", "temp","humidity","pressure","wind_speed","rain_duration","snow_duration","day_night","time_band","season","weather_main","energy_demand")]
### Generate Working and Evaluation Data
# Random 80/20 split, fixed seed. NOTE(review): a random split of time-series
# rows can leak E_1/E_2/E_25 information between the two sets.
set.seed(501)
working_rows <- sample(1:nrow(Madrid_Data),0.80*nrow(Madrid_Data))
working_Madrid <-Madrid_Data[working_rows,]
working_Madrid$index <- as.numeric(row.names(working_Madrid))
working_Madrid<- working_Madrid[order(working_Madrid$index), ]
working_Madrid<-working_Madrid %>%
relocate(index)
evaluation_Madrid <-Madrid_Data[-working_rows,]
evaluation_Madrid$index <- as.numeric(row.names(evaluation_Madrid))
evaluation_Madrid<- evaluation_Madrid[order(evaluation_Madrid$index), ]
evaluation_Madrid<-evaluation_Madrid %>%
relocate(index)
write.csv(working_Madrid,"Data/working_Madrid.csv", row.names = FALSE)
write.csv(evaluation_Madrid,"Data/evaluation_Madrid.csv", row.names = FALSE)
# Read the Barcelona working/evaluation splits back in.
# BUG FIX: the directory is "Data/" (capital D) everywhere else in this
# script; 'data/' would fail on case-sensitive file systems.
evaldata <- read.csv("Data/evaluation_Barcelona.csv")
workdata <- read.csv("Data/working_Barcelona.csv")
# Remove index, time_ID and snow_duration columns (positions 1, 2, 11)
# from the "eval" analysis data frame
evalBarcelona <- evaldata[c(-1, -2, -11)]
# Same removal for the "work" analysis data frame
workBarcelona <- workdata[c(-1, -2, -11)]
# head(workBarcelona, 4L)
# Correlation matrix for numeric predictors only (categorical predictors,
# columns 9-12, excluded from the pairs plot)
workcor <- workBarcelona[c(-9:-12)]
ggpairs(workcor)
# Full correlation matrix. NOTE(review): the categorical columns read back
# from CSV are integer-coded here, so cor() treats them as numeric.
cor(workBarcelona)
E_1 E_2 E_25 temp humidity pressure wind_speed rain_duration day_night time_band season
E_1 1.00000000 0.95043237 0.66580794 0.13878927 -0.28177561 -0.02781520 0.12785107 -0.01756328 -0.546196259 0.111967183 0.08088787
E_2 0.95043237 1.00000000 0.56952817 0.10748859 -0.24873502 -0.03648859 0.12975971 -0.01762592 -0.539309783 0.091879443 0.07961938
E_25 0.66580794 0.56952817 1.00000000 0.15876073 -0.31888324 -0.03315805 0.14094675 -0.01457261 -0.508755131 0.112525926 0.08003023
temp 0.13878927 0.10748859 0.15876073 1.00000000 -0.29083714 -0.08811130 0.08937600 0.03851107 -0.319483326 -0.103859527 -0.34479411
humidity -0.28177561 -0.24873502 -0.31888324 -0.29083714 1.00000000 0.10761304 -0.22672868 0.04008084 0.369843046 -0.097717527 0.04568793
pressure -0.02781520 -0.03648859 -0.03315805 -0.08811130 0.10761304 1.00000000 -0.12403082 -0.08243537 0.042578653 -0.030693135 0.22132029
wind_speed 0.12785107 0.12975971 0.14094675 0.08937600 -0.22672868 -0.12403082 1.00000000 0.03733744 -0.237455373 -0.000365960 -0.02069053
rain_duration -0.01756328 -0.01762592 -0.01457261 0.03851107 0.04008084 -0.08243537 0.03733744 1.00000000 -0.013813870 0.024856915 -0.03184723
day_night -0.54619626 -0.53930978 -0.50875513 -0.31948333 0.36984305 0.04257865 -0.23745537 -0.01381387 1.000000000 0.001813599 0.15078914
time_band 0.11196718 0.09187944 0.11252593 -0.10385953 -0.09771753 -0.03069313 -0.00036596 0.02485691 0.001813599 1.000000000 0.16363562
season 0.08088787 0.07961938 0.08003023 -0.34479411 0.04568793 0.22132029 -0.02069053 -0.03184723 0.150789145 0.163635620 1.00000000
weather_main -0.01894161 -0.01580890 -0.01977277 -0.03246087 0.18608532 -0.18441348 0.05154379 0.43511910 0.007496362 -0.012094920 -0.01586947
energy_demand 0.95024655 0.83107411 0.69947826 0.16573813 -0.30543266 -0.01938825 0.12361053 -0.01523665 -0.512051575 0.123752931 0.08143799
weather_main energy_demand
E_1 -0.018941608 0.95024655
E_2 -0.015808902 0.83107411
E_25 -0.019772771 0.69947826
temp -0.032460871 0.16573813
humidity 0.186085318 -0.30543266
pressure -0.184413485 -0.01938825
wind_speed 0.051543786 0.12361053
rain_duration 0.435119102 -0.01523665
day_night 0.007496362 -0.51205158
time_band -0.012094920 0.12375293
season -0.015869470 0.08143799
weather_main 1.000000000 -0.02147315
energy_demand -0.021473153 1.00000000
# Write the numeric-predictor correlation heat map to a CMYK PDF (7 x 4 in).
pdf(
  "corplot.pdf",
  width = 7,
  height = 4,
  bg = "white",
  colormodel = "cmyk"
)
corrplot(
  cor(workcor),
  method = "color",
  type = "full",
  addCoef.col = "red",
  tl.col = "black",
  number.cex = 0.75
)
dev.off()
null device
1
# On-screen version of the correlation heat map saved to corplot.pdf above.
corrplot(cor(workcor), method="color", type="full", addCoef.col = "red", tl.col="black",number.cex = 0.75)
# Candidate full model: both one- and two-step demand terms (E_1, E_2) plus
# the 24-step term and all weather/calendar predictors (as factors).
workModel1 <- lm(energy_demand ~ E_1 + E_2 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workBarcelona)
# Generalized variance inflation factors to screen for multicollinearity.
vif(workModel1)
GVIF Df GVIF^(1/(2*Df))
E_1 13.942127 1 3.733916
E_2 11.846657 1 3.441897
E_25 2.148160 1 1.465660
temp 3.375736 1 1.837318
humidity 1.444593 1 1.201912
pressure 1.163925 1 1.078854
wind_speed 1.140701 1 1.068036
rain_duration 1.350867 1 1.162268
factor(day_night) 1.913604 1 1.383331
factor(time_band) 1.102038 2 1.024588
factor(season) 3.546552 3 1.234907
factor(weather_main) 1.581208 8 1.029051
# Store the GVIF table so the problem predictors can be extracted below.
multicol_<-vif(workModel1)
print("VIF values greater than 10, implying serious multicollinearity, are:")
[1] "VIF values greater than 10, implying serious multicollinearity, are:"
# Report predictors whose GVIF exceeds the common rule-of-thumb cutoff of 10.
multicol_[multicol_[,"GVIF"]>10,1]
E_1 E_2
13.94213 11.84666
According to the VIF output, the E_1 and E_2 variables have a serious multicollinearity issue (GVIF > 10); we therefore remove E_2 and rebuild the model.
# Rebuild the full model without E_2 (E_1 and E_2 had GVIF > 10 above).
workModelFull <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workBarcelona)
# Re-check multicollinearity for the reduced predictor set.
vif(workModelFull)
GVIF Df GVIF^(1/(2*Df))
E_1 2.049752 1 1.431696
E_25 1.962199 1 1.400785
temp 3.305833 1 1.818195
humidity 1.444100 1 1.201707
pressure 1.161911 1 1.077920
wind_speed 1.139121 1 1.067296
rain_duration 1.350681 1 1.162188
factor(day_night) 1.849385 1 1.359921
factor(time_band) 1.098891 2 1.023855
factor(season) 3.487460 3 1.231454
factor(weather_main) 1.580054 8 1.029004
According to the VIF output, we can observe that there’s no serious multicollinearity issue.
# Regression diagnostics for workModelFull: raw/standardized residuals,
# fitted values, a normal Q-Q plot, and a standardized-residual plot with
# +/- 3 outlier bands.
res.fullmodel1 <- residuals(workModelFull)
std.res.fullmodel1 <- rstandard(workModelFull)
pred.fullmodel1 <- fitted.values(workModelFull)
par(mfrow=c(1,1))
# Normality check: standardized residuals vs. theoretical normal quantiles.
qqnorm(y = std.res.fullmodel1, main = " Normal Q-Q Plot ",
       xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
qqline(y = std.res.fullmodel1)
# Best-fit line through the residual-vs-fitted cloud (should be ~flat).
resplotdata1 <- data.frame(std.res.fullmodel1, pred.fullmodel1)
resbf1 <- lm(std.res.fullmodel1 ~ pred.fullmodel1, data = resplotdata1)
# Points with |standardized residual| > 3 are flagged in red.
point_cols <- ifelse(abs(std.res.fullmodel1) > 3, "red", "black")
plot(x = pred.fullmodel1, y = std.res.fullmodel1,
     ylab = "Standardized Residuals", xlab = "Predicted Values",
     main = "Residuals Plot", col = point_cols)
abline(h = 0, col = "blue", lty = 1)
abline(resbf1, col = "red", lty = 3)
abline(h = 3, col = "green", lty = 3)
abline(h = -3, col = "green", lty = 3)
legend("bottomleft",
       legend = c("Best fit line of standardized residuals",
                  "Horizontal line y = 0.0",
                  "Horizontal line, y = +/- 3"),
       fill = c("red", "blue", "green"), cex = 1.0)
According to the normal Q-Q plot, most points align with the reference line, so the normality assumption appears reasonable.
According to the residual plot, there is no apparent heteroscedasticity; nevertheless, there are numerous outliers (standardized residuals beyond +/- 3).
# Coefficient table and fit statistics for the reduced full model.
summary(workModelFull)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + pressure +
wind_speed + rain_duration + factor(day_night) + factor(time_band) +
factor(season) + factor(weather_main), data = workBarcelona)
Residuals:
Min 1Q Median 3Q Max
-1531.74 -90.57 14.60 99.89 1412.55
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2803.519570 152.435690 -18.391 < 0.0000000000000002 ***
E_1 0.885257 0.002489 355.738 < 0.0000000000000002 ***
E_25 0.121752 0.002432 50.067 < 0.0000000000000002 ***
temp 7.170224 0.260203 27.556 < 0.0000000000000002 ***
humidity -0.634227 0.065866 -9.629 < 0.0000000000000002 ***
pressure 0.730545 0.136673 5.345 0.000000091017 ***
wind_speed -1.977760 0.514097 -3.847 0.00012 ***
rain_duration 2.422471 1.711138 1.416 0.15687
factor(day_night)2 65.503160 2.613422 25.064 < 0.0000000000000002 ***
factor(time_band)2 -18.244439 12.868697 -1.418 0.15628
factor(time_band)3 62.331534 10.004587 6.230 0.000000000472 ***
factor(season)2 -73.426137 3.686941 -19.915 < 0.0000000000000002 ***
factor(season)3 -29.983347 2.900109 -10.339 < 0.0000000000000002 ***
factor(season)4 17.277737 3.165117 5.459 0.000000048344 ***
factor(weather_main)2 -2.322696 2.122710 -1.094 0.27387
factor(weather_main)3 -13.355521 13.517978 -0.988 0.32317
factor(weather_main)4 156.628292 113.610051 1.379 0.16801
factor(weather_main)5 16.796013 22.805446 0.736 0.46144
factor(weather_main)7 -43.788142 9.722793 -4.504 0.000006706350 ***
factor(weather_main)8 13.107792 4.065549 3.224 0.00127 **
factor(weather_main)10 117.912863 48.565247 2.428 0.01519 *
factor(weather_main)12 3.124515 12.573762 0.248 0.80375
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 160.6 on 27940 degrees of freedom
Multiple R-squared: 0.9156, Adjusted R-squared: 0.9156
F-statistic: 1.444e+04 on 21 and 27940 DF, p-value: < 0.00000000000000022
# Tidy the coefficient table so terms can be filtered programmatically.
df <- workModelFull %>%
  tidy()
# List the terms whose p-value exceeds 0.05 (candidates for removal).
# Refer to the piped data's own column (p.value) rather than df$p.value —
# referencing the external object inside a pipe is a tidy-eval anti-pattern
# and breaks as soon as the piped data differs from `df`.
df %>%
  filter(p.value > 0.05)
# Drop rain_duration (the non-significant continuous predictor) and refit.
workModelPvalue <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workBarcelona)
summary(workModelPvalue)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + pressure +
wind_speed + factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main), data = workBarcelona)
Residuals:
Min 1Q Median 3Q Max
-1532.03 -90.58 14.64 99.91 1412.13
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2804.180567 152.437714 -18.396 < 0.0000000000000002 ***
E_1 0.885189 0.002488 355.770 < 0.0000000000000002 ***
E_25 0.121702 0.002432 50.051 < 0.0000000000000002 ***
temp 7.162242 0.260147 27.532 < 0.0000000000000002 ***
humidity -0.638104 0.065810 -9.696 < 0.0000000000000002 ***
pressure 0.734014 0.136653 5.371 0.000000078767 ***
wind_speed -1.962770 0.513997 -3.819 0.000134 ***
factor(day_night)2 65.425220 2.612889 25.039 < 0.0000000000000002 ***
factor(time_band)2 -17.154791 12.845890 -1.335 0.181746
factor(time_band)3 62.623435 10.002642 6.261 0.000000000389 ***
factor(season)2 -72.999238 3.674655 -19.866 < 0.0000000000000002 ***
factor(season)3 -29.911221 2.899713 -10.315 < 0.0000000000000002 ***
factor(season)4 17.237276 3.165045 5.446 0.000000051908 ***
factor(weather_main)2 -2.311979 2.122735 -1.089 0.276097
factor(weather_main)3 -13.139596 13.517361 -0.972 0.331032
factor(weather_main)4 156.902134 113.611928 1.381 0.167279
factor(weather_main)5 16.929897 22.805659 0.742 0.457878
factor(weather_main)7 -43.540310 9.721391 -4.479 0.000007535584 ***
factor(weather_main)8 15.811254 3.589221 4.405 0.000010607663 ***
factor(weather_main)10 118.504216 48.564323 2.440 0.014687 *
factor(weather_main)12 3.569918 12.570052 0.284 0.776411
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 160.6 on 27941 degrees of freedom
Multiple R-squared: 0.9156, Adjusted R-squared: 0.9156
F-statistic: 1.516e+04 on 20 and 27941 DF, p-value: < 0.00000000000000022
# Partial F-test: reduced (P-value) model vs. the full model.
anova(workModelPvalue, workModelFull)
Analysis of Variance Table
Model 1: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Model 2: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
rain_duration + factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 27941 720280919
2 27940 720229254 1 51664 2.0042 0.1569
At the 5% significance level (partial F-test p-value = 0.157), there is no statistical evidence that the full model is preferred over the reduced model: the reduced model is simpler and no prediction performance is lost by eliminating the identified predictor. That is to say, workModelPvalue is preferred over workModelFull.
# Exhaustive best-subset search, keeping the single best model of each size
# (regsubsets defaults to subsets of up to 8 terms).
bestfits <- regsubsets(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workBarcelona, nbest = 1)
# Visualize which terms enter the best model at each adjusted-R^2 level.
plot(bestfits, scale="adjr2")
# Refit the predictor set suggested by the adjusted-R^2 plot.
workModelBestfit<-lm(energy_demand ~ E_1 + E_25 + temp + humidity + factor(day_night) + factor(season) , data = workBarcelona)
summary(workModelBestfit)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + factor(day_night) +
factor(season), data = workBarcelona)
Residuals:
Min 1Q Median 3Q Max
-1543.34 -90.26 15.10 99.63 1400.13
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2052.560979 76.260658 -26.915 < 0.0000000000000002 ***
E_1 0.885823 0.002487 356.180 < 0.0000000000000002 ***
E_25 0.122668 0.002431 50.459 < 0.0000000000000002 ***
temp 7.080417 0.257109 27.539 < 0.0000000000000002 ***
humidity -0.566084 0.063212 -8.955 < 0.0000000000000002 ***
factor(day_night)2 67.594434 2.564110 26.362 < 0.0000000000000002 ***
factor(season)2 -70.964302 3.649244 -19.446 < 0.0000000000000002 ***
factor(season)3 -27.809163 2.884814 -9.640 < 0.0000000000000002 ***
factor(season)4 21.157266 3.034653 6.972 0.0000000000032 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 160.9 on 27953 degrees of freedom
Multiple R-squared: 0.9152, Adjusted R-squared: 0.9152
F-statistic: 3.77e+04 on 8 and 27953 DF, p-value: < 0.00000000000000022
# Partial F-test: best-subset model vs. the larger P-value model.
anova(workModelBestfit, workModelPvalue)
Analysis of Variance Table
Model 1: energy_demand ~ E_1 + E_25 + temp + humidity + factor(day_night) +
factor(season)
Model 2: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 27953 723907059
2 27941 720280919 12 3626141 11.722 < 0.00000000000000022 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
At the 5% significance level the partial F-test is highly significant (p < 0.0001), so the additional predictors in the P-value model do improve the fit and the smaller Bestfit model loses prediction performance. That is to say, workModelPvalue is preferred over workModelBestfit.
# Forward stepwise AIC selection, from the intercept-only model up to the
# full model's predictor scope.
workNullModel <- lm(energy_demand ~ 1, data = workBarcelona)
step.working <- stepAIC(workNullModel, scope = list(lower = workNullModel,
upper = workModelFull), direction = "forward", trace=FALSE)
summary(step.working)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + factor(day_night) +
temp + factor(season) + humidity + factor(time_band) + factor(weather_main) +
pressure + wind_speed + rain_duration, data = workBarcelona)
Residuals:
Min 1Q Median 3Q Max
-1531.74 -90.57 14.60 99.89 1412.55
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2803.519570 152.435690 -18.391 < 0.0000000000000002 ***
E_1 0.885257 0.002489 355.738 < 0.0000000000000002 ***
E_25 0.121752 0.002432 50.067 < 0.0000000000000002 ***
factor(day_night)2 65.503160 2.613422 25.064 < 0.0000000000000002 ***
temp 7.170224 0.260203 27.556 < 0.0000000000000002 ***
factor(season)2 -73.426137 3.686941 -19.915 < 0.0000000000000002 ***
factor(season)3 -29.983347 2.900109 -10.339 < 0.0000000000000002 ***
factor(season)4 17.277737 3.165117 5.459 0.000000048344 ***
humidity -0.634227 0.065866 -9.629 < 0.0000000000000002 ***
factor(time_band)2 -18.244439 12.868697 -1.418 0.15628
factor(time_band)3 62.331534 10.004587 6.230 0.000000000472 ***
factor(weather_main)2 -2.322696 2.122710 -1.094 0.27387
factor(weather_main)3 -13.355521 13.517978 -0.988 0.32317
factor(weather_main)4 156.628292 113.610051 1.379 0.16801
factor(weather_main)5 16.796013 22.805446 0.736 0.46144
factor(weather_main)7 -43.788142 9.722793 -4.504 0.000006706350 ***
factor(weather_main)8 13.107792 4.065549 3.224 0.00127 **
factor(weather_main)10 117.912863 48.565247 2.428 0.01519 *
factor(weather_main)12 3.124515 12.573762 0.248 0.80375
pressure 0.730545 0.136673 5.345 0.000000091017 ***
wind_speed -1.977760 0.514097 -3.847 0.00012 ***
rain_duration 2.422471 1.711138 1.416 0.15687
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 160.6 on 27940 degrees of freedom
Multiple R-squared: 0.9156, Adjusted R-squared: 0.9156
F-statistic: 1.444e+04 on 21 and 27940 DF, p-value: < 0.00000000000000022
# Set seed for reproducibility in CV
set.seed(123)
# Set up k-fold cross-validation with k = 10
# (method = "cv" is a single, non-repeated CV)
train.control <- trainControl(method = "cv", number = 10)
# Train the model
# Backward subset selection scored by 10-fold CV, trying sizes 1..11.
step.cv.work <- train(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workBarcelona, method = "leapBackward",
tuneGrid = data.frame(nvmax = 1:11),
trControl = train.control)
# CV performance for each candidate size, then the selected subset.
step.cv.work$results
summary(step.cv.work$finalModel)
Subset selection object
21 Variables (and intercept)
Forced in Forced out
E_1 FALSE FALSE
E_25 FALSE FALSE
temp FALSE FALSE
humidity FALSE FALSE
pressure FALSE FALSE
wind_speed FALSE FALSE
rain_duration FALSE FALSE
factor(day_night)2 FALSE FALSE
factor(time_band)2 FALSE FALSE
factor(time_band)3 FALSE FALSE
factor(season)2 FALSE FALSE
factor(season)3 FALSE FALSE
factor(season)4 FALSE FALSE
factor(weather_main)2 FALSE FALSE
factor(weather_main)3 FALSE FALSE
factor(weather_main)4 FALSE FALSE
factor(weather_main)5 FALSE FALSE
factor(weather_main)7 FALSE FALSE
factor(weather_main)8 FALSE FALSE
factor(weather_main)10 FALSE FALSE
factor(weather_main)12 FALSE FALSE
1 subsets of each size up to 11
Selection Algorithm: backward
E_1 E_25 temp humidity pressure wind_speed rain_duration factor(day_night)2 factor(time_band)2 factor(time_band)3 factor(season)2
1 ( 1 ) "*" " " " " " " " " " " " " " " " " " " " "
2 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
3 ( 1 ) "*" "*" " " " " " " " " " " "*" " " " " " "
4 ( 1 ) "*" "*" "*" " " " " " " " " "*" " " " " " "
5 ( 1 ) "*" "*" "*" " " " " " " " " "*" " " " " "*"
6 ( 1 ) "*" "*" "*" " " " " " " " " "*" " " " " "*"
7 ( 1 ) "*" "*" "*" "*" " " " " " " "*" " " " " "*"
8 ( 1 ) "*" "*" "*" "*" " " " " " " "*" " " "*" "*"
9 ( 1 ) "*" "*" "*" "*" "*" " " " " "*" " " "*" "*"
10 ( 1 ) "*" "*" "*" "*" "*" " " " " "*" " " "*" "*"
11 ( 1 ) "*" "*" "*" "*" "*" " " " " "*" " " "*" "*"
factor(season)3 factor(season)4 factor(weather_main)2 factor(weather_main)3 factor(weather_main)4 factor(weather_main)5 factor(weather_main)7
1 ( 1 ) " " " " " " " " " " " " " "
2 ( 1 ) " " " " " " " " " " " " " "
3 ( 1 ) " " " " " " " " " " " " " "
4 ( 1 ) " " " " " " " " " " " " " "
5 ( 1 ) " " " " " " " " " " " " " "
6 ( 1 ) "*" " " " " " " " " " " " "
7 ( 1 ) "*" " " " " " " " " " " " "
8 ( 1 ) "*" " " " " " " " " " " " "
9 ( 1 ) "*" " " " " " " " " " " " "
10 ( 1 ) "*" " " " " " " " " " " " "
11 ( 1 ) "*" "*" " " " " " " " " " "
factor(weather_main)8 factor(weather_main)10 factor(weather_main)12
1 ( 1 ) " " " " " "
2 ( 1 ) " " " " " "
3 ( 1 ) " " " " " "
4 ( 1 ) " " " " " "
5 ( 1 ) " " " " " "
6 ( 1 ) " " " " " "
7 ( 1 ) " " " " " "
8 ( 1 ) " " " " " "
9 ( 1 ) " " " " " "
10 ( 1 ) "*" " " " "
11 ( 1 ) "*" " " " "
# Model suggested by the CV subset search (drops wind_speed and rain_duration).
workModelCross <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workBarcelona)
# Partial F-test: CV model (reduced) vs. the P-value model.
anova( workModelCross, workModelPvalue)
Analysis of Variance Table
Model 1: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + factor(day_night) +
factor(time_band) + factor(season) + factor(weather_main)
Model 2: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 27942 720656824
2 27941 720280919 1 375905 14.582 0.0001345 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Fit details for the CV-selected model.
summary(workModelCross)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + pressure +
factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main), data = workBarcelona)
Residuals:
Min 1Q Median 3Q Max
-1539.30 -90.57 14.51 99.82 1403.40
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2842.315313 152.147211 -18.681 < 0.0000000000000002 ***
E_1 0.885397 0.002488 355.852 < 0.0000000000000002 ***
E_25 0.121666 0.002432 50.024 < 0.0000000000000002 ***
temp 7.085295 0.259428 27.311 < 0.0000000000000002 ***
humidity -0.607712 0.065343 -9.300 < 0.0000000000000002 ***
pressure 0.784867 0.136036 5.770 0.000000008032 ***
factor(day_night)2 66.722012 2.591356 25.748 < 0.0000000000000002 ***
factor(time_band)2 -15.310566 12.839927 -1.192 0.2331
factor(time_band)3 63.568928 10.002007 6.356 0.000000000211 ***
factor(season)2 -71.764888 3.661300 -19.601 < 0.0000000000000002 ***
factor(season)3 -29.174014 2.893983 -10.081 < 0.0000000000000002 ***
factor(season)4 16.333074 3.156942 5.174 0.000000231086 ***
factor(weather_main)2 -3.183514 2.110943 -1.508 0.1315
factor(weather_main)3 -13.335646 13.520548 -0.986 0.3240
factor(weather_main)4 151.166860 113.629607 1.330 0.1834
factor(weather_main)5 15.960942 22.809789 0.700 0.4841
factor(weather_main)7 -43.087845 9.723032 -4.432 0.000009392517 ***
factor(weather_main)8 14.635296 3.576855 4.092 0.000042948541 ***
factor(weather_main)10 120.377548 48.573646 2.478 0.0132 *
factor(weather_main)12 2.616717 12.570627 0.208 0.8351
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 160.6 on 27942 degrees of freedom
Multiple R-squared: 0.9156, Adjusted R-squared: 0.9155
F-statistic: 1.595e+04 on 19 and 27942 DF, p-value: < 0.00000000000000022
At the 5% significance level the partial F-test is significant (p = 0.0001), so dropping wind_speed in the cross-validation model loses some fit relative to the P-value model. That is to say, workModelPvalue is preferred over workModelCross.
#ModelOLS <- ols_step_all_possible(workModelFull)
#cat("\n")
#plot(x = ModelOLS$n, y = ModelOLS$adjr)
#plot(x = ModelOLS$n, y = ModelOLS$aic)
#OLSsummary <- ModelOLS %>% group_by(n) %>% filter(adjr == max(adjr))
#OLSsummary
Based on Adjusted R-square, the most favorable OLS model is equivalent to workModelFull.
# Hand-collected comparison of the selection methods: number of predictor
# variables, number of fitted coefficients, and adjusted R^2 per model.
Method       <- c("Full", "P-value", "BestFits", "AIC Forward", "Cross Validation", "OLS_step")
Variables    <- c(12, 11, 7, 12, 10, 12)
Coefficients <- c(22, 21, 9, 22, 20, 21)
Adjusted_R2  <- c(0.9155554, 0.9155523, 0.9151636, 0.9155554, 0.9155113, 0.9155523)
BarcelonaTable <- data.frame(Method, Variables, Coefficients, Adjusted_R2)
# Summary table for different selection methods
print(BarcelonaTable)
#Alternate order
#ARSlist <- list(FullARS = c(summary(workModelFull)$adj.r.squared),
# PvalueARS = c(summary(workModelPvalue)$adj.r.squared),
# BestfitsARS = c(summary(workModelBestfit)$adj.r.squared),
# AICARS = c(summary(step.working)$adj.r.squared),
# CrossvARS = c(summary(workModelCross)$adj.r.squared),
# OLSARS = max(OLSsummary$adjr))
#ModelOrder <- as.data.frame(ARSlist) %>% pivot_longer(cols = 1:4) %>% arrange(desc(value))
#ModelOrder
Model Adjusted R-square from above methods (to 4 significant figures)
Full: 0.9156
P-value: 0.9156 (viewed to be the same as Full)
Bestfits: 0.9152
AIC Method: 0.9156
Cross-validation: 0.9155
OLS-step: 0.9156 (viewed to be the same as P-value)
The final model is workModelPvalue: it ties for the highest adjusted R-square while using fewer predictors than the other top-scoring methods (Full, AIC Forward, OLS-step).
# Final Barcelona model (the P-value model) refit under its final name.
workFinal <- lm(
  energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
    factor(day_night) + factor(time_band) + factor(season) +
    factor(weather_main),
  data = workBarcelona
)
par(mfrow = c(2, 2))
# Residual plot: studentized residuals vs. fitted; |residual| > 3 in red.
stud_res_final <- studres(workFinal)
p1 <- plot(x = fitted(workFinal), y = stud_res_final,
           xlab = "Fitted Values", ylab = "Studentized Residuals",
           main = "Residuals Plot",
           col = ifelse(abs(stud_res_final) > 3, "red", "black"))
abline(h = -3, col = "red", lty = 2)
abline(h = 3, col = "red", lty = 2)
# Anderson-Darling test of normality on the standardized residuals.
std.res <- rstandard(workFinal)
ad.test(std.res)
Anderson-Darling normality test
data: std.res
A = 122.77, p-value < 0.00000000000000022
# Q-Q plot of studentized residuals; the title records the AD-test p-value.
p2<-qqnorm(studres(workFinal), pch = 1, frame = FALSE, main=expression("AD test =2.2*10"^-16))
qqline(studres(workFinal), col = "steelblue", lwd = 2)
#leverage
# Hat values, with the usual 2p/n leverage cutoff.
lev<-hatvalues(workFinal)
cutlev = (2*length(coefficients(workFinal)))/nrow(workBarcelona)
# Count and assess the number of leverage values > cut-off
potoutlier <- sum(lev > cutlev, na.rm=TRUE)
totcount = length(fitted(workFinal))
print(paste("The number of leverage points that are potential outliers is", potoutlier,". This is", round(100*potoutlier/totcount,2),"% of the total number of predicted values, (", totcount,") thus not material."))
[1] "The number of leverage points that are potential outliers is 1143 . This is 4.09 % of the total number of predicted values, ( 27962 ) thus not material."
#barplot(lev, ylim = c(0, 2*cutlev))
# Spike plot of leverages against the cutoff line.
p3<-plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for Barcelona working set")
abline(h = cutlev, col = "red", lty=2)
#cook distance
cookdist<-cooks.distance(workFinal)
#barplot(cookdist, ylim=c(0,1.01), main = "Cook's Distance plot")
#barplot(cookdist, ylim=c(0,0.005), main = "Cook's Distance plot")
#plot(log(cookdist), type="h", ylim=c(0,1.01), main = "Cook's Distance plot")
#abline(h = 1, col = "red")
# Cook's distance with a broken y-axis (ggbreak::scale_y_break) so the D = 1
# reference line stays visible alongside the tiny observed distances.
ggplot(as.data.frame(cookdist),aes(x=1:nrow(as.data.frame(cookdist)),y=cookdist)) +
geom_line() +
geom_hline(aes(yintercept = 1,color = "red"))+
scale_y_continuous(limits = c(0,1.1))+
scale_y_break(breaks = c(0.005,0.9), scales =0.1) +
guides(color = "none") +
labs(title="Cook's Distance",
x ="Index", y = "")
Prepare summary plot for report
# Report figure: studentized-residual plot written to a CMYK PDF (7 x 4 in).
pdf(
  "BarcelonaAssumptionCheck1.pdf",
  width = 7,
  height = 4,
  bg = "white",
  colormodel = "cmyk"
)
# Residuals scatter: |studentized residual| > 3 flagged in red.
resid_vals <- studres(workFinal)
plot(x = fitted(workFinal), y = resid_vals,
     xlab = "Fitted Values", ylab = "Studentized Residuals",
     main = "Residuals Plot",
     col = ifelse(abs(resid_vals) > 3, "red", "black"))
abline(h = -3, col = "red", lty = 2)
abline(h = 3, col = "red", lty = 2)
grid(10, 10)
dev.off()
null device
1
#normality plot
# Report figure: Q-Q plot of the studentized residuals (CMYK PDF, 7 x 4 in).
pdf("BarcelonaAssumptionCheck2.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
qqnorm(studres(workFinal), pch = 1, frame = FALSE, main=expression("AD test =2.2*10"^-16))
qqline(studres(workFinal), col = "steelblue", lwd = 2)
grid(10,10)
dev.off()
null device
1
#leverage plot
# Report figure: working-set leverage spikes with the 2p/n cutoff line.
pdf("BarcelonaAssumptionCheck3.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for Barcelona working set")
abline(h = cutlev, col = "red", lty=2)
grid(10,10)
dev.off()
null device
1
#Cook's distance plot
# Report figure: Cook's distance with broken y-axis (ggbreak).
# NOTE(review): in a plain (non-knitr) script a ggplot object must be
# wrapped in print() to render inside a pdf() device — confirm this PDF
# is not empty when the script is run non-interactively.
pdf("BarcelonaAssumptionCheck4.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
ggplot(as.data.frame(cookdist),aes(x=1:nrow(as.data.frame(cookdist)),y=cookdist)) +
geom_line() +
geom_hline(aes(yintercept = 1,color = "red"))+
scale_y_continuous(limits = c(0,1.1))+
scale_y_break(breaks = c(0.005,0.9), scales =0.1) +
guides(color = "none") +
labs(title="Cook's Distance",
x ="Index", y = "")
dev.off()
null device
1
# Drop columns 2 and 13 before prediction — presumably E_2 and energy_demand
# given the column order shown in the correlation output above; confirm.
# NOTE(review): positional drops assume the evaluation CSV column order is fixed.
newdata <- evalBarcelona[ c(-2,-13)]
# Predict hold-out demand with the final working-set model.
predict.eval <- predict(workFinal, newdata)
# Predicted vs. actual demand for the hold-out set; the blue line is y = x.
plot(x = evalBarcelona$energy_demand, y=predict.eval, xlab="Actual energy demand for the evaluation set",
ylab="Predicted energy demand for the evaluation set", main = "Barcelona")
abline(a=0, b=1, col="blue")
grid(10,10)
Prepare report figure
# Opening the graphical device
# Customizing the output
# Report figure: predicted vs. actual evaluation-set demand (CMYK PDF, 7 x 4 in).
pdf("BarcelonaEvalActComp.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
par(mfrow=c(1,1))
# Points on the blue y = x line are perfect predictions.
plot(x = evalBarcelona$energy_demand, y=predict.eval, xlab="Actual energy demand for the evaluation set",
ylab="Predicted energy demand for the evaluation set", main = "Barcelona", cex.lab = 0.75)
abline(a=0, b=1, col="blue")
grid(10,10)
# Closing the graphical device
dev.off()
null device
1
# Root-mean-square error on the evaluation set (Metrics::rmse argument
# order: actual first, then predicted).
rootmse <- rmse(evaldata$energy_demand, predict.eval)
print(paste("The root mean square error of the predicted vs actual",nrow(evaldata),"hourly energy_data points for Barcelona over the 2015-2018 timespan is an energy demand difference of",round(rootmse,2),"MWh."))
[1] "The root mean square error of the predicted vs actual 6991 hourly energy_data points for Barcelona over the 2015-2018 timespan is an energy demand difference of 158.46 MWh."
cat("\n")
# Side-by-side actual vs. predicted table (inspect with head() if needed).
demandcompdf <- data.frame("Actual (MWh)" = evalBarcelona$energy_demand,"Predicted (MWh)" = predict.eval)
# head(demandcompdf,4L)
cat("\n")
# Illustrative single-observation percent error.
print(paste("For example, the first actual demand value is",round(evaldata$energy_demand[1],2),"MWh and the corresponding predicted value is",round(predict.eval[1],2),"MWh. The percent error between this predicted value relative to the actual value is",round((100*(predict.eval[1] - evaldata$energy_demand[1])/evaldata$energy_demand[1]),2),"%."))
[1] "For example, the first actual demand value is 2949.49 MWh and the corresponding predicted value is 2779.19 MWh. The percent error between this predicted value relative to the actual value is -5.77 %."
cat("\n")
# Mean absolute percentage error (MAPE) across the evaluation set.
diffpct <- c((abs(evaldata$energy_demand - predict.eval))/evaldata$energy_demand)
print(paste("The mean difference between the",nrow(evaldata),"evaluation data set actual and model predicted values is",round(100*mean(diffpct),2),"%."))
[1] "The mean difference between the 6991 evaluation data set actual and model predicted values is 3.58 %."
Obtain X matrix from the model
# Obtain the design matrix X of the final working model; its cross-product
# feeds the evaluation-set leverage calculation below.
Predictors <-rownames(summary(workFinal)$coefficients[,])
workpredictors <- as.data.frame(Predictors)
X <- model.matrix(workFinal)
class(X)
[1] "matrix" "array"
cat("\n")
dim(X)
[1] 27962 21
cat("\n")
#head(X, 4L)
# Fit the same specification on the evaluation set, solely to obtain its
# model matrix (the coefficients themselves are not used afterwards).
evalFinal <- lm(energy_demand ~ E_1 + E_25 + temp + humidity
+ pressure + wind_speed + factor(day_night)
+ factor(time_band) + factor(season)
+ factor(weather_main), data = evalBarcelona)
Predictors <-rownames(summary(evalFinal)$coefficients[,])
evalpredictors <- as.data.frame(Predictors)
x_new1 <- model.matrix(evalFinal)
cat("\n")
dim(x_new1)
[1] 6991 20
cat("\n")
#Check if there's a predictor difference between the work and eval matrices
workevalcompare <-anti_join(workpredictors,evalpredictors)
Joining, by = "Predictors"
cat("\n")
# The evaluation model matrix has one fewer column than X (20 vs 21): the
# "factor(weather_main)4" dummy is missing. Append a zero column and move it
# into that position so the two matrices are conformable.
x_new_ <-as.data.frame(cbind(x_new1,rep(0,nrow(x_new1))))
dummy_xnew<- x_new_ %>% relocate(V21,.before = 16) %>% rename("factor(weather_main)4" = V21)
x_new <- as.matrix(dummy_xnew)
cat("\n")
class(x_new)
[1] "matrix" "array"
cat("\n")
dim(x_new)
[1] 6991 21
cat("\n")
# head(x_new, 4L)
# Leverage of evaluation points relative to the working design matrix:
# H_new = X_new (X'X)^{-1} X_new'; only its diagonal is used below.
# NOTE(review): forming the full 6991 x 6991 matrix is memory-heavy;
# rowSums((x_new %*% h_new_mid) * x_new) would yield the diagonal directly.
h_new_mid <- solve(t(X)%*%X)
dim(h_new_mid)
[1] 21 21
h_new <- x_new%*%h_new_mid%*%t(x_new)
dim(h_new)
[1] 6991 6991
#cutlev2 = (2*length(coefficients(evalFinal)))/nrow(evalBarcelona)
# Count and assess the number of leverage values > cut-off
# (re-uses cutlev = 2p/n from the working set, for comparability)
potoutlier2 <- sum(diag(h_new) > cutlev, na.rm=TRUE)
totcount2 = nrow(evalBarcelona)
plot(diag(h_new), type = "h", ylab = "Leverage for the validation data set", ylim = c(0,2*cutlev), main = "Barcelona")
abline(h=cutlev, col="red", lty=2)
print(paste("The number of leverage points that are potential outliers is", potoutlier2,". This is", round(100*potoutlier2/totcount2,2),"% of the total number of predicted values, (", totcount2,") thus not material."))
[1] "The number of leverage points that are potential outliers is 298 . This is 4.26 % of the total number of predicted values, ( 6991 ) thus not material."
# Side-by-side leverage comparison: working (left) vs. evaluation (right).
par(mfrow=c(1,2))
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for the working data set", main = "Barcelona")
abline(h = cutlev, col = "red", lty=2)
plot(diag(h_new), type = "h", ylim = c(0,2*cutlev), ylab = "Leverage for the evaluation data set", main = "Barcelona")
abline(h=cutlev, col="red", lty=2)
print(paste("The leverage cutoff exceedences in the evaluation set is",round(100*potoutlier2/totcount2,2),"% of the total number of observed values (", totcount2,") and is comparable to that of the working set,", round(100*potoutlier/totcount,2),"% of the total number of predicted values (", totcount,")."))
[1] "The leverage cutoff exceedences in the evaluation set is 4.26 % of the total number of observed values ( 6991 ) and is comparable to that of the working set, 4.09 % of the total number of predicted values ( 27962 )."
Prepare report figure
# Opening the graphical device
# Customizing the output
# Report figure: side-by-side leverage plots for the working and evaluation
# sets, written to a wide CMYK PDF (14 x 4 in).
pdf("BarcelonaLevComp.pdf",       # File name
    width = 14, height = 4,       # Width and height in inches
    bg = "white",                 # Background color
    colormodel = "cmyk")          # Color model
par(mfrow = c(1, 2))
# `res` is a bitmap-device argument (png/jpeg), not a plot() argument, so
# the stray `res = 600` was removed; resolution is meaningless for PDF.
plot(lev, type = "h", ylab = "Leverage", ylim = c(0, 2*cutlev), yaxt = 'n',
     main = "Barcelona Working Data")
abline(h = cutlev, col = "red", lty = 2)
# Custom y-axis: ticks at 0, cutlev and 2*cutlev (rounded for readability).
axis(side = 2, at = seq(0, 2*round(cutlev, 5), by = round(cutlev, 5)))
plot(diag(h_new), type = "h", ylab = "", ylim = c(0, 2*cutlev), yaxt = 'n',
     main = "Barcelona Evaluation Data")
abline(h = cutlev, col = "red", lty = 2)
axis(side = 2, at = seq(0, 2*round(cutlev, 5), by = round(cutlev, 5)))
# Closing the graphical device
dev.off()
null device
1
# Load the Bilbao evaluation (hold-out) and working (training) data sets
evaldata<-read.csv('data/evaluation_Bilbao.csv')
workdata<-read.csv('data/working_Bilbao.csv')
#remove index and time_ID columns from "eval" analysis data frame
evalBilbao<-evaldata[c(-1,-2)]
#remove index and time_ID columns from "work" analysis data frame
workBilbao<-workdata[c(-1,-2)]
# head(workBilbao, 4L)
# Correlation matrix for numeric predictors only (categorical predictors excluded)
workcor<-workBilbao[c(-10:-13)]
workcor_<-ggpairs(workcor)
corvalues<- cor(workBilbao)
# corplot2<- corrplot(cor(workcor), method="color", type="full", addCoef.col = "red", tl.col="black",number.cex = 0.75)
# First model includes all candidate predictors (E_1, E_2, E_25 lags together)
workModelBilbao1 <- lm(energy_demand ~ E_1 + E_2 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + snow_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workBilbao)
vif_values <- vif(workModelBilbao1)
# "Full" model drops E_2 (presumably due to high VIF against E_1 — see vif_values; confirm)
workModelBilbaoFull <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + snow_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workBilbao)
vif_values2<- vif(workModelBilbaoFull)
# to obtain residuals
res.fullmodel1 <- residuals(workModelBilbaoFull)
# to obtain standardized residuals
std.res.fullmodel1 <- rstandard(workModelBilbaoFull)
# to obtain fitted/predicted values
pred.fullmodel1 <- fitted.values(workModelBilbaoFull)
# Normality check: Q-Q plot of standardized residuals for the full Bilbao model
par(mfrow=c(1,1))
qqnorm(y = std.res.fullmodel1, main = " Normal Q-Q Plot ",
xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
qqline(y = std.res.fullmodel1)
#residual plot
# Best-fit line through the residuals (should be ~flat if mean-zero assumption holds)
resplotdata1 <- data.frame(std.res.fullmodel1, pred.fullmodel1)
resbf1 <- lm(std.res.fullmodel1 ~ pred.fullmodel1, data = resplotdata1)
# Points beyond +/- 3 standardized residuals highlighted in red
plot(x = pred.fullmodel1, y = std.res.fullmodel1, ylab = "Standardized Residuals", xlab = "Predicted Values", main = "Residuals Plot", col = ifelse(std.res.fullmodel1 < -3,"red",ifelse(std.res.fullmodel1 > 3,"red","black")))
abline(h = 0, col="blue", lty=1)
abline(resbf1, col="red", lty=3)
abline(h = 3, col="green", lty=3)
abline(h=-3, col="green", lty=3)
legend("bottomleft", legend=c("Best fit line of standardized residuals", "Horizontal line y = 0.0", "Horizontal line, y = +/- 3"), fill = c("red","blue","green"), cex = 1.0)
# summary(workModelBilbaoFull)
# Tidy the full-model coefficient table into a data frame of term/estimate/p.value
df<- workModelBilbaoFull %>%
tidy()
# List the terms whose p-value exceeds 0.05 (candidates for removal)
# Use data masking (bare column name) rather than `df$p.value` inside filter()
df%>%
filter(p.value > 0.05)
# Reduced model: snow_duration dropped (non-significant per the filter above)
workModelBilbaoPvalue <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workBilbao)
#summary(workModelBilbaoPvalue)
# Partial F-test: reduced vs full model
anovacheck1<-anova(workModelBilbaoPvalue, workModelBilbaoFull)
# Best-subsets selection over the full predictor set (one best model per size)
bestfits <- regsubsets(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + snow_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workBilbao, nbest = 1)
# plot(bestfits, scale="adjr2")
# Model suggested by the best-subsets adjusted-R^2 plot
workModelBilbaoBestfit<-lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + rain_duration + factor(day_night) + factor(weather_main), data = workBilbao)
# summary(workModelBilbaoBestfit)
anovacheck2<-anova(workModelBilbaoBestfit, workModelBilbaoPvalue)
# Forward stepwise AIC selection from the intercept-only model up to the full model
workNullModel<- lm(energy_demand ~ 1, data = workBilbao)
step.working <- stepAIC(workNullModel, scope = list(lower = workNullModel,
upper = workModelBilbaoFull), direction = "forward", trace=FALSE)
# summary(step.working)
Same reduced model as P-Value approach
# Set seed for reproducibility in CV
set.seed(123)
# Set up repeated k-fold cross-validation, with k=10
train.control <- trainControl(method = "cv", number = 10)
# Train the model
# Backward subset selection via caret, tuning the max number of predictors (nvmax)
step.cv.work <- train(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + snow_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workBilbao, method = "leapBackward",
tuneGrid = data.frame(nvmax = 1:12),
trControl = train.control)
# step.cv.work$results
# summary(step.cv.work$finalModel)
# Model implied by the CV-selected subset
workModelBilbaoCross <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workBilbao)
anovacheck3<-anova( workModelBilbaoCross, workModelBilbaoPvalue)
# summary(workModelBilbaoCross)
# Exhaustive all-possible-subsets search (commented out: long run time)
#ModelOLS <- ols_step_all_possible(workModelBilbaoFull)
#cat("\n")
#plot(x = ModelOLS$n, y = ModelOLS$adjr)
#plot(x = ModelOLS$n, y = ModelOLS$aic)
#OLSsummary <- ModelOLS %>% group_by(n) %>% filter(adjr == max(adjr))
#cat("\n")
#OLSsummary
Same reduced model as P-Value approach (thus commenting out given processing time)
Model adjusted R-square from above methods
Full: 0.9154
P-value: 0.9154
Bestfits: 0.9149
AIC Method: 0.9154
Cross-validation: 0.9154
OLS-step: 0.9154
Final model is the workModelBilbaoPvalue with highest Adjusted R-square and fewest predictors.
# Summary table of the selection methods compared for Bilbao
Method <- c('Full', 'P-value', 'BestFits', 'AIC Forward', 'Cross Validation','OLS_step')
Variables <- c(13,12,9,12,11,12)
Coefficients <- c(23,22,16,22,21,22)
Adjusted_R2 <- c(0.9153865,0.9153886,0.9148913,0.9153886,0.9153680,0.9153886)
BilbaoTable<-data.frame(Method,Variables,Coefficients,Adjusted_R2)
# Final model = the p-value-reduced model. Reuse the already-fitted object
# (same formula as the previous refit; matches the Madrid section's style).
workFinal <- workModelBilbaoPvalue
# Assumption checks for the final Bilbao model
par(mfrow=c(2,2))
#Residual Plot
# Studentized residuals vs fitted; points beyond +/- 3 flagged in red
plot(x=fitted(workFinal), y=studres(workFinal), xlab = "Fitted Values",
ylab = "Studentized Residuals", main='Residuals Plot', col = ifelse(studres(workFinal) < -3,"red",ifelse(studres(workFinal) > 3,"red","black")))
abline(h=-3, col="red", lty=2)
abline(h=3, col="red", lty=2)
#norm test
# Anderson-Darling normality test on standardized residuals (nortest package)
std.res <- rstandard(workFinal)
ad.test(std.res)
Anderson-Darling normality test
data: std.res
A = 133, p-value < 0.00000000000000022
# Q-Q plot annotated with the AD test p-value from above
qqnorm(studres(workFinal), pch = 1, frame = FALSE, main=expression("AD test =2.2*10"^-16))
qqline(studres(workFinal), col = "steelblue", lwd = 2)
#leverage
# Leverage (hat values); rule-of-thumb cut-off 2p/n
lev<-hatvalues(workFinal)
cutlev = (2*length(coefficients(workFinal)))/nrow(workBilbao)
# Count and assess the number of leverage values > cut-off
potoutlier <- sum(lev > cutlev, na.rm=TRUE)
totcount = length(fitted(workFinal))
print(paste("The number of leverage points that are potential outliers is", potoutlier,". This is", round(100*potoutlier/totcount,2),"% of the total number of predicted values, (", totcount,") thus not material."))
[1] "The number of leverage points that are potential outliers is 1860 . This is 6.66 % of the total number of predicted values, ( 27936 ) thus not material."
#barplot(lev, ylim = c(0, 2*cutlev))
# Leverage spike plot with the 2p/n cut-off line
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for Bilbao working set")
abline(h = cutlev, col = "red", lty=2)
#cook distance
# Cook's distance for influence; values near 1 would indicate influential points
cookdist<-cooks.distance(workFinal)
#barplot(cookdist, ylim=c(0,1.01), main = "Cook's Distance plot")
#barplot(cookdist, ylim=c(0,0.005), main = "Cook's Distance plot")
#plot((cookdist), type="h", ylim=c(0,0.05), main = "Cook's Distance plot")
#abline(h = 1, col = "red")
# ggplot with a broken y-axis (ggbreak) so both the data and the D=1 line are visible
ggplot(as.data.frame(cookdist),aes(x=1:nrow(as.data.frame(cookdist)),y=cookdist)) +
geom_line() +
geom_hline(aes(yintercept = 1,color = "red"))+
scale_y_continuous(limits = c(0,1.1))+
scale_y_break(breaks = c(0.015,0.9), scales =0.1) +
guides(color = "none") +
labs(title="Cook's Distance",
x ="Index", y = "")
Prepare summary plot for report
# Opening the graphical device
# Customizing the output
# Report figure: residuals scatter plot for the final Bilbao model
pdf("BilbaoAssumptionCheck1.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
# par(mfrow=c(2,2))
#residuals scatter plot
plot(x=fitted(workFinal), y=studres(workFinal), xlab = "Fitted Values",
ylab = "Studentized Residuals", main='Residuals Plot', col = ifelse(studres(workFinal) < -3,"red",ifelse(studres(workFinal) > 3,"red","black")))
abline(h=-3, col="red", lty=2)
abline(h=3, col="red", lty=2)
grid(10,10)
dev.off()
null device
1
#normality plot
# Report figure: Q-Q normality plot for the final Bilbao model
pdf("BilbaoAssumptionCheck2.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
qqnorm(studres(workFinal), pch = 1, frame = FALSE, main=expression("AD test =2.2*10"^-16))
qqline(studres(workFinal), col = "steelblue", lwd = 2)
grid(10,10)
dev.off()
null device
1
#leverage plot
# Report figure: leverage spike plot for the final Bilbao model
pdf("BilbaoAssumptionCheck3.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
# FIX: axis label said "Barcelona" — this is the Bilbao working set (lev from the Bilbao model)
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for Bilbao working set")
abline(h = cutlev, col = "red", lty=2)
grid(10,10)
dev.off()
null device
1
#Cook's distance plot
# Report figure: Cook's distance with a broken y-axis (ggbreak)
pdf("BilbaoAssumptionCheck4.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
ggplot(as.data.frame(cookdist),aes(x=1:nrow(as.data.frame(cookdist)),y=cookdist)) +
geom_line() +
geom_hline(aes(yintercept = 1,color = "red"))+
scale_y_continuous(limits = c(0,1.1))+
scale_y_break(breaks = c(0.005,0.9), scales =0.1) +
guides(color = "none") +
labs(title="Cook's Distance",
x ="Index", y = "")
dev.off()
null device
1
# Predict on the evaluation set; drop columns 2 and 14 to match the model's predictors
newdata <- evalBilbao[ c(-2,-14)]
predict.eval <- predict(workFinal, newdata)
# Actual vs predicted; the blue y = x line marks perfect prediction
plot(x = evalBilbao$energy_demand, y=predict.eval, xlab="Actual energy demand for the evaluation set",
ylab="Predicted energy demand for the evaluation set", main = "Bilbao")
abline(a=0, b=1, col="blue")
grid(10,10)
Prepare report figure
# Opening the graphical device
# Customizing the output
# Report figure: actual vs predicted energy demand on the evaluation set
pdf("BilbaoEvalActComp.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
par(mfrow=c(1,1))
plot(x = evalBilbao$energy_demand, y=predict.eval, xlab="Actual energy demand for the evaluation set",
ylab="Predicted energy demand for the evaluation set", main = "Bilbao", cex.lab = 0.75)
abline(a=0, b=1, col="blue")
grid(10,10)
# Closing the graphical device
dev.off()
null device
1
# Out-of-sample RMSE on the evaluation set (Metrics::rmse)
rootmse <- rmse(evalBilbao$energy_demand, predict.eval)
print(paste("The root mean square error of the predicted vs actual",nrow(evaldata),"hourly energy_data points for Bilbao over the 2015-2018 timespan is an energy demand difference of",round(rootmse,2),"MWh."))
[1] "The root mean square error of the predicted vs actual 6985 hourly energy_data points for Bilbao over the 2015-2018 timespan is an energy demand difference of 61.98 MWh."
cat("\n")
# Side-by-side actual vs predicted values, plus a worked single-point example
demandcompdf <- data.frame("Actual (MWh)" = evalBilbao$energy_demand,"Predicted (MWh)" = predict.eval)
# head(demandcompdf,4L)
cat("\n")
print(paste("For example, the first actual demand value is",round(evaldata$energy_demand[1],2),"MWh and the corresponding predicted value is",round(predict.eval[1],2),"MWh. The percent error between this predicted value relative to the actual value is",round((100*(predict.eval[1] - evaldata$energy_demand[1])/evaldata$energy_demand[1]),2),"%."))
[1] "For example, the first actual demand value is 926.33 MWh and the corresponding predicted value is 927.72 MWh. The percent error between this predicted value relative to the actual value is 0.15 %."
cat("\n")
# Mean absolute percent difference between actual and predicted demand
diffpct <- c((abs(evaldata$energy_demand - predict.eval))/evaldata$energy_demand)
print(paste("The mean difference between the",nrow(evaldata),"evaluation data set actual and model predicted values is",round(100*mean(diffpct),2),"%."))
[1] "The mean difference between the 6985 evaluation data set actual and model predicted values is 3.64 %."
Obtain X matrix from the model
# Extract the working-set design matrix X and its predictor names from the final model
Predictors <-rownames(summary(workFinal)$coefficients[,])
workpredictors <- as.data.frame(Predictors)
X <- model.matrix(workFinal)
class(X)
[1] "matrix" "array"
cat("\n")
dim(X)
[1] 27936 22
cat("\n")
# head(X, 4L)
Create x_new matrix using the validation data. Note that the purpose of running the evaluation regression here is to extract all of the correct predictor coefficients.
# Fit the same formula on the evaluation set purely to build its design matrix
# (coefficients themselves are not used — only model.matrix / predictor names)
evalFinal <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = evalBilbao)
Predictors <-rownames(summary(evalFinal)$coefficients[,])
evalpredictors <- as.data.frame(Predictors)
x_new1 <- model.matrix(evalFinal)
cat("\n")
dim(x_new1)
[1] 6985 22
cat("\n")
#Check if there's a predictor difference between the work and eval matrices
workevalcompare <-anti_join(workpredictors,evalpredictors)
Joining, by = "Predictors"
x_new <- x_new1
#x_new_ <-as.data.frame(cbind(x_new1,rep(0,nrow(x_new1))))
#dummy_xnew<- x_new_ %>% relocate(V21,.before = 16) %>% rename("factor(weather_main)4" = V21)
#x_new <- as.matrix(dummy_xnew)
cat("\n")
class(x_new)
[1] "matrix" "array"
cat("\n")
dim(x_new)
[1] 6985 22
cat("\n")
# head(x_new, 4L)
# Evaluation-set hat matrix H_new = X_new (X'X)^-1 X_new'
# NOTE(review): only diag(h_new) is used downstream; the full 6985 x 6985 matrix
# is memory-heavy — rowSums((x_new %*% h_new_mid) * x_new) would give the diagonal directly.
h_new_mid <- solve(t(X)%*%X)
dim(h_new_mid)
[1] 22 22
h_new <- x_new%*%h_new_mid%*%t(x_new)
dim(h_new)
[1] 6985 6985
Plot the hatvalues for the data in the evaluation set
# Reuse the working-set cut-off (2p/n) rather than recomputing one for the eval set
#cutlev2 = (2*length(coefficients(evalFinal)))/nrow(evalBilbao)
# Count and assess the number of leverage values > cut-off
potoutlier2 <- sum(diag(h_new) > cutlev, na.rm=TRUE)
totcount2 = nrow(evalBilbao)
# plot(diag(h_new), type = "h", ylab = "Leverage for the validation data set", ylim = c(0,2*cutlev), main = "Bilbao")
# abline(h=cutlev, col="red", lty=2)
print(paste("The number of leverage points that are potential outliers is", potoutlier2,". This is", round(100*potoutlier2/totcount2,2),"% of the total number of predicted values, (", totcount2,") thus not material."))
[1] "The number of leverage points that are potential outliers is 498 . This is 7.13 % of the total number of predicted values, ( 6985 ) thus not material."
Side-by-side comparison
# Side-by-side leverage plots: working set vs evaluation set.
# FIX: plot titles said "Barcelona" — this is the Bilbao section (copy-paste error).
par(mfrow=c(1,2))
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for the working data set", main = "Bilbao")
abline(h = cutlev, col = "red", lty=2)
plot(diag(h_new), type = "h", ylim = c(0,2*cutlev), ylab = "Leverage for the evaluation data set", main = "Bilbao")
abline(h=cutlev, col="red", lty=2)
# Compare exceedance percentages between the two sets
print(paste("The leverage cutoff exceedences in the evaluation set is",round(100*potoutlier2/totcount2,2),"% of the total number of observed values (", totcount2,") and is comparable to that of the working set,", round(100*potoutlier/totcount,2),"% of the total number of predicted values (", totcount,")."))
[1] "The leverage cutoff exceedences in the evaluation set is 7.13 % of the total number of observed values ( 6985 ) and is comparable to that of the working set, 6.66 % of the total number of predicted values ( 27936 )."
Prepare report figure
# Opening the graphical device
# Customizing the output
pdf("BilbaoLevComp.pdf", # File name
width = 14, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
par(mfrow=c(1,2))
# NOTE: dropped the invalid `res = 600` argument — `res` is a bitmap-device
# (png/jpeg) argument, not a graphical parameter, and only triggers a warning here.
plot(lev, type="h", ylab = "Leverage", ylim = c(0,2*cutlev), yaxt = 'n', main = "Bilbao Working Data")
abline(h = cutlev, col = "red", lty=2)
axis(side = 2, at=seq(0,2*round(cutlev,5),by=round(cutlev,5)))
plot(diag(h_new), type = "h", ylab = "", ylim = c(0,2*cutlev), yaxt = 'n', main = "Bilbao Evaluation Data")
abline(h=cutlev, col="red", lty=2)
axis(side = 2, at=seq(0,2*round(cutlev,5),by=round(cutlev,5)))
# Closing the graphical device
dev.off()
null device
1
# Load the Madrid evaluation (hold-out) and working (training) data sets
evaldata<-read.csv('data/evaluation_Madrid.csv')
workdata<-read.csv('data/working_Madrid.csv')
#remove index and time_ID columns from "eval" analysis dataframe
evalMadrid<-evaldata[c(-1,-2)]
#remove index and time_ID columns from "work" analysis dataframe
workMadrid<-workdata[c(-1,-2)]
head(workMadrid, 4L)
# Correlation matrix for numeric predictors only (categorical predictors excluded)
workcor<-workMadrid[c(-10:-13) ]
ggpairs(workcor)
cor(workMadrid)
E_1 E_2 E_25 temp humidity pressure wind_speed rain_duration snow_duration day_night
E_1 1.0000000000 0.9499434705 0.6565882882 0.143391210 -0.156162266 0.0004020366 0.053524514 0.04305124 -0.0009123988 -0.542698752
E_2 0.9499434705 1.0000000000 0.5667056032 0.095664873 -0.104597491 0.0004102668 0.024935983 0.03279678 -0.0003432574 -0.535715015
E_25 0.6565882882 0.5667056032 1.0000000000 0.177936106 -0.211198630 -0.0003217271 0.083479995 0.03982243 0.0099683465 -0.511477237
temp 0.1433912096 0.0956648734 0.1779361065 1.000000000 -0.794717256 0.0239361080 0.069165992 -0.09436353 -0.0050214038 -0.310400429
humidity -0.1561622663 -0.1045974910 -0.2111986301 -0.794717256 1.000000000 -0.0934723236 -0.115914299 0.20892142 0.0034644595 0.296478951
pressure 0.0004020366 0.0004102668 -0.0003217271 0.023936108 -0.093472324 1.0000000000 -0.153619309 -0.08927302 -0.0018305651 0.010873667
wind_speed 0.0535245136 0.0249359835 0.0834799950 0.069165992 -0.115914299 -0.1536193090 1.000000000 0.13766407 0.0047321116 -0.112761329
rain_duration 0.0430512358 0.0327967771 0.0398224320 -0.094363530 0.208921418 -0.0892730225 0.137664073 1.00000000 0.0252627940 -0.039484943
snow_duration -0.0009123988 -0.0003432574 0.0099683465 -0.005021404 0.003464459 -0.0018305651 0.004732112 0.02526279 1.0000000000 -0.005716835
day_night -0.5426987515 -0.5357150149 -0.5114772370 -0.310400429 0.296478951 0.0108736668 -0.112761329 -0.03948494 -0.0057168354 1.000000000
time_band 0.1141809926 0.0968003546 0.0859565795 -0.125152565 0.086303385 -0.2942368366 0.045006948 0.00305527 -0.0007100628 -0.003141369
season 0.0832153797 0.0826213909 0.0824320712 -0.367294486 0.316990500 0.0677459079 -0.109874045 -0.05817062 -0.0079750205 0.143257400
weather_main 0.0445935510 0.0373754702 0.0311186482 -0.198277741 0.384072009 -0.1135259629 0.163858653 0.62217661 0.0174843840 -0.044672800
energy_demand 0.9499222286 0.8313872235 0.6847988867 0.184015883 -0.200387099 -0.0002102571 0.080783557 0.04958506 -0.0021141307 -0.509903410
time_band season weather_main energy_demand
E_1 0.1141809926 0.083215380 0.04459355 0.9499222286
E_2 0.0968003546 0.082621391 0.03737547 0.8313872235
E_25 0.0859565795 0.082432071 0.03111865 0.6847988867
temp -0.1251525653 -0.367294486 -0.19827774 0.1840158834
humidity 0.0863033852 0.316990500 0.38407201 -0.2003870994
pressure -0.2942368366 0.067745908 -0.11352596 -0.0002102571
wind_speed 0.0450069477 -0.109874045 0.16385865 0.0807835573
rain_duration 0.0030552696 -0.058170615 0.62217661 0.0495850577
snow_duration -0.0007100628 -0.007975021 0.01748438 -0.0021141307
day_night -0.0031413686 0.143257400 -0.04467280 -0.5099034101
time_band 1.0000000000 0.161864278 0.01049282 0.1256833010
season 0.1618642784 1.000000000 0.03949142 0.0841398021
weather_main 0.0104928162 0.039491421 1.00000000 0.0471215127
energy_demand 0.1256833010 0.084139802 0.04712151 1.0000000000
# Heatmap of the numeric-predictor correlation matrix
corrplot(cor(workcor), method="color", type="full", addCoef.col = "red", tl.col="black",number.cex = 0.75)
# First Madrid model with all candidate predictors; VIF checks collinearity
workModelMadrid1 <- lm(energy_demand ~ E_1 + E_2 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + snow_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workMadrid)
vif(workModelMadrid1)
GVIF Df GVIF^(1/(2*Df))
E_1 13.683412 1 3.699110
E_2 11.842865 1 3.441346
E_25 2.066652 1 1.437586
temp 4.852956 1 2.202943
humidity 3.756247 1 1.938104
pressure 1.202259 1 1.096476
wind_speed 1.225852 1 1.107182
rain_duration 2.189530 1 1.479706
snow_duration 1.001029 1 1.000515
factor(day_night) 1.806033 1 1.343887
factor(time_band) 1.206976 2 1.048153
factor(season) 3.038317 3 1.203480
factor(weather_main) 3.422558 8 1.079933
# "Full" Madrid model drops E_2 (its GVIF was ~11.8 vs E_1's ~13.7 above); re-check VIF
workModelMadridFull <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + snow_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workMadrid)
vif(workModelMadridFull)
GVIF Df GVIF^(1/(2*Df))
E_1 2.004683 1 1.415868
E_25 1.935699 1 1.391294
temp 4.775330 1 2.185253
humidity 3.745110 1 1.935229
pressure 1.201910 1 1.096316
wind_speed 1.223651 1 1.106187
rain_duration 2.189374 1 1.479653
snow_duration 1.001006 1 1.000503
factor(day_night) 1.732418 1 1.316213
factor(time_band) 1.202172 2 1.047108
factor(season) 2.970957 3 1.198991
factor(weather_main) 3.414880 8 1.079782
# Residual diagnostics for the full Madrid model
# to obtain residuals
res.fullmodel1 <- residuals(workModelMadridFull)
# to obtain standardized residuals
std.res.fullmodel1 <- rstandard(workModelMadridFull)
# to obtain fitted/predicted values
pred.fullmodel1 <- fitted.values(workModelMadridFull)
par(mfrow=c(1,1))
qqnorm(y = std.res.fullmodel1, main = " Normal Q-Q Plot ",
xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
qqline(y = std.res.fullmodel1)
#residual plot
# Best-fit line through residuals; points beyond +/- 3 flagged in red
resplotdata1 <- data.frame(std.res.fullmodel1, pred.fullmodel1)
resbf1 <- lm(std.res.fullmodel1 ~ pred.fullmodel1, data = resplotdata1)
plot(x = pred.fullmodel1, y = std.res.fullmodel1, ylab = "Standardized Residuals", xlab = "Predicted Values", main = "Residuals Plot", col = ifelse(std.res.fullmodel1 < -3,"red",ifelse(std.res.fullmodel1 > 3,"red","black")))
abline(h = 0, col="blue", lty=1)
abline(resbf1, col="red", lty=3)
abline(h = 3, col="green", lty=3)
abline(h=-3, col="green", lty=3)
legend("bottomleft", legend=c("Best fit line of standardized residuals", "Horizontal line y = 0.0", "Horizontal line, y = +/- 3"), fill = c("red","blue","green"), cex = 0.5)
summary(workModelMadridFull)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + pressure +
wind_speed + rain_duration + snow_duration + factor(day_night) +
factor(time_band) + factor(season) + factor(weather_main),
data = workMadrid)
Residuals:
Min 1Q Median 3Q Max
-1097.14 -56.10 9.01 63.40 914.40
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1093.188294 59.986245 -18.224 < 0.0000000000000002 ***
E_1 0.892155 0.002464 362.140 < 0.0000000000000002 ***
E_25 0.103032 0.002424 42.513 < 0.0000000000000002 ***
temp 3.432903 0.150672 22.784 < 0.0000000000000002 ***
humidity -0.563259 0.049087 -11.475 < 0.0000000000000002 ***
pressure 0.121147 0.034840 3.477 0.000507 ***
wind_speed 2.491848 0.355546 7.009 0.00000000000246 ***
rain_duration 13.671539 4.609904 2.966 0.003023 **
snow_duration -124.613076 100.661587 -1.238 0.215749
factor(day_night)2 42.023414 1.669358 25.173 < 0.0000000000000002 ***
factor(time_band)2 8.612174 8.811459 0.977 0.328388
factor(time_band)3 74.552399 6.956330 10.717 < 0.0000000000000002 ***
factor(season)2 -48.870952 2.298821 -21.259 < 0.0000000000000002 ***
factor(season)3 -7.374987 1.870953 -3.942 0.00008105907428 ***
factor(season)4 25.722059 2.070951 12.420 < 0.0000000000000002 ***
factor(weather_main)2 8.527549 1.568031 5.438 0.00000005422072 ***
factor(weather_main)3 14.908620 6.279141 2.374 0.017589 *
factor(weather_main)5 4.942025 5.673912 0.871 0.383757
factor(weather_main)6 -7.593469 27.311364 -0.278 0.780989
factor(weather_main)7 18.121717 5.063404 3.579 0.000346 ***
factor(weather_main)8 27.082310 3.935399 6.882 0.00000000000604 ***
factor(weather_main)10 40.922832 18.759636 2.181 0.029160 *
factor(weather_main)12 32.889632 9.415102 3.493 0.000478 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 105.5 on 27740 degrees of freedom
Multiple R-squared: 0.9161, Adjusted R-squared: 0.916
F-statistic: 1.377e+04 on 22 and 27740 DF, p-value: < 0.00000000000000022
# Tidy the full-model coefficient table into a data frame of term/estimate/p.value
df<- workModelMadridFull %>%
tidy()
# List the terms whose p-value exceeds 0.05 (candidates for removal)
# Use data masking (bare column name) rather than `df$p.value` inside filter()
df%>%
filter(p.value > 0.05)
# Reduced model: snow_duration dropped (non-significant, p = 0.216 in the full model)
workModelMadridPvalue <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workMadrid)
summary(workModelMadridPvalue)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + pressure +
wind_speed + rain_duration + factor(day_night) + factor(time_band) +
factor(season) + factor(weather_main), data = workMadrid)
Residuals:
Min 1Q Median 3Q Max
-1097.12 -56.13 9.00 63.40 914.46
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1093.581173 59.985981 -18.231 < 0.0000000000000002 ***
E_1 0.892186 0.002463 362.168 < 0.0000000000000002 ***
E_25 0.102993 0.002423 42.500 < 0.0000000000000002 ***
temp 3.434373 0.150669 22.794 < 0.0000000000000002 ***
humidity -0.562954 0.049087 -11.468 < 0.0000000000000002 ***
pressure 0.121104 0.034840 3.476 0.000510 ***
wind_speed 2.492271 0.355549 7.010 0.00000000000244 ***
rain_duration 13.595711 4.609541 2.949 0.003186 **
factor(day_night)2 42.030231 1.669365 25.177 < 0.0000000000000002 ***
factor(time_band)2 8.614108 8.811544 0.978 0.328284
factor(time_band)3 74.551969 6.956397 10.717 < 0.0000000000000002 ***
factor(season)2 -48.869024 2.298842 -21.258 < 0.0000000000000002 ***
factor(season)3 -7.363501 1.870948 -3.936 0.00008315458019 ***
factor(season)4 25.742089 2.070908 12.430 < 0.0000000000000002 ***
factor(weather_main)2 8.525961 1.568045 5.437 0.00000005454471 ***
factor(weather_main)3 14.917369 6.279198 2.376 0.017523 *
factor(weather_main)5 4.936703 5.673965 0.870 0.384274
factor(weather_main)6 -7.601158 27.311626 -0.278 0.780775
factor(weather_main)7 18.113599 5.063448 3.577 0.000348 ***
factor(weather_main)8 27.056170 3.935380 6.875 0.00000000000633 ***
factor(weather_main)10 40.938710 18.759812 2.182 0.029099 *
factor(weather_main)12 32.907720 9.415181 3.495 0.000474 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 105.5 on 27741 degrees of freedom
Multiple R-squared: 0.9161, Adjusted R-squared: 0.916
F-statistic: 1.442e+04 on 21 and 27741 DF, p-value: < 0.00000000000000022
# Partial F-test: dropping snow_duration does not significantly worsen fit (p = 0.216 below)
anova(workModelMadridPvalue, workModelMadridFull)
Analysis of Variance Table
Model 1: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
rain_duration + factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Model 2: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
rain_duration + snow_duration + factor(day_night) + factor(time_band) +
factor(season) + factor(weather_main)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 27741 308998752
2 27740 308981682 1 17070 1.5325 0.2157
# Best-subsets selection for Madrid (one best model per size)
bestfits <- regsubsets(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + snow_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workMadrid, nbest = 1)
plot(bestfits, scale="adjr2")
# Model suggested by the best-subsets adjusted-R^2 plot
workModelMadridBestfit<-lm(energy_demand ~ E_1 + E_25 + temp + wind_speed + factor(day_night) + factor(time_band) + factor(season) , data = workMadrid)
summary(workModelMadridBestfit)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + wind_speed +
factor(day_night) + factor(time_band) + factor(season), data = workMadrid)
Residuals:
Min 1Q Median 3Q Max
-1084.09 -56.96 9.10 64.09 916.61
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1293.374957 32.392614 -39.928 < 0.0000000000000002 ***
E_1 0.892776 0.002471 361.305 < 0.0000000000000002 ***
E_25 0.106053 0.002420 43.822 < 0.0000000000000002 ***
temp 4.427351 0.112498 39.355 < 0.0000000000000002 ***
wind_speed 3.789287 0.332289 11.404 < 0.0000000000000002 ***
factor(day_night)2 40.399494 1.662907 24.294 < 0.0000000000000002 ***
factor(time_band)2 1.913481 8.661304 0.221 0.825
factor(time_band)3 67.719513 6.725624 10.069 < 0.0000000000000002 ***
factor(season)2 -52.144538 2.274335 -22.927 < 0.0000000000000002 ***
factor(season)3 -9.819047 1.859339 -5.281 0.000000129 ***
factor(season)4 23.902070 2.030660 11.771 < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 106 on 27752 degrees of freedom
Multiple R-squared: 0.9154, Adjusted R-squared: 0.9154
F-statistic: 3.002e+04 on 10 and 27752 DF, p-value: < 0.00000000000000022
# Partial F-test: best-subsets model vs the p-value-reduced model
anova(workModelMadridBestfit, workModelMadridPvalue)
Analysis of Variance Table
Model 1: energy_demand ~ E_1 + E_25 + temp + wind_speed + factor(day_night) +
factor(time_band) + factor(season)
Model 2: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
rain_duration + factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 27752 311591757
2 27741 308998752 11 2593005 21.163 < 0.00000000000000022 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Forward stepwise AIC selection from the intercept-only model up to the full model
workNullModel<- lm(energy_demand ~ 1, data = workMadrid)
step.working <- stepAIC(workNullModel, scope = list(lower = workNullModel,
upper = workModelMadridFull), direction = "forward", trace=FALSE)
summary(step.working)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + humidity + factor(day_night) +
factor(weather_main) + factor(season) + temp + factor(time_band) +
wind_speed + pressure + rain_duration, data = workMadrid)
Residuals:
Min 1Q Median 3Q Max
-1097.12 -56.13 9.00 63.40 914.46
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1093.581173 59.985981 -18.231 < 0.0000000000000002 ***
E_1 0.892186 0.002463 362.168 < 0.0000000000000002 ***
E_25 0.102993 0.002423 42.500 < 0.0000000000000002 ***
humidity -0.562954 0.049087 -11.468 < 0.0000000000000002 ***
factor(day_night)2 42.030231 1.669365 25.177 < 0.0000000000000002 ***
factor(weather_main)2 8.525961 1.568045 5.437 0.00000005454471 ***
factor(weather_main)3 14.917369 6.279198 2.376 0.017523 *
factor(weather_main)5 4.936703 5.673965 0.870 0.384274
factor(weather_main)6 -7.601158 27.311626 -0.278 0.780775
factor(weather_main)7 18.113599 5.063448 3.577 0.000348 ***
factor(weather_main)8 27.056170 3.935380 6.875 0.00000000000633 ***
factor(weather_main)10 40.938710 18.759812 2.182 0.029099 *
factor(weather_main)12 32.907720 9.415181 3.495 0.000474 ***
factor(season)2 -48.869024 2.298842 -21.258 < 0.0000000000000002 ***
factor(season)3 -7.363501 1.870948 -3.936 0.00008315458019 ***
factor(season)4 25.742089 2.070908 12.430 < 0.0000000000000002 ***
temp 3.434373 0.150669 22.794 < 0.0000000000000002 ***
factor(time_band)2 8.614108 8.811544 0.978 0.328284
factor(time_band)3 74.551969 6.956397 10.717 < 0.0000000000000002 ***
wind_speed 2.492271 0.355549 7.010 0.00000000000244 ***
pressure 0.121104 0.034840 3.476 0.000510 ***
rain_duration 13.595711 4.609541 2.949 0.003186 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 105.5 on 27741 degrees of freedom
Multiple R-squared: 0.9161, Adjusted R-squared: 0.916
F-statistic: 1.442e+04 on 21 and 27741 DF, p-value: < 0.00000000000000022
Same reduced model as P-value
# Set seed for reproducibility in CV
set.seed(123)
# Set up repeated k-fold cross-validation, with k=10
train.control <- trainControl(method = "cv", number = 10)
# Train the model
# Backward subset selection via caret, tuning the max number of predictors (nvmax)
step.cv.work <- train(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + snow_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workMadrid, method = "leapBackward",
tuneGrid = data.frame(nvmax = 1:12),
trControl = train.control)
Reordering variables and trying again:
step.cv.work$results
summary(step.cv.work$finalModel)
Subset selection object
22 Variables (and intercept)
Forced in Forced out
E_1 FALSE FALSE
E_25 FALSE FALSE
temp FALSE FALSE
humidity FALSE FALSE
pressure FALSE FALSE
wind_speed FALSE FALSE
rain_duration FALSE FALSE
snow_duration FALSE FALSE
factor(day_night)2 FALSE FALSE
factor(time_band)2 FALSE FALSE
factor(time_band)3 FALSE FALSE
factor(season)2 FALSE FALSE
factor(season)3 FALSE FALSE
factor(season)4 FALSE FALSE
factor(weather_main)2 FALSE FALSE
factor(weather_main)3 FALSE FALSE
factor(weather_main)5 FALSE FALSE
factor(weather_main)6 FALSE FALSE
factor(weather_main)7 FALSE FALSE
factor(weather_main)8 FALSE FALSE
factor(weather_main)10 FALSE FALSE
factor(weather_main)12 FALSE FALSE
1 subsets of each size up to 12
Selection Algorithm: backward
E_1 E_25 temp humidity pressure wind_speed rain_duration snow_duration factor(day_night)2 factor(time_band)2 factor(time_band)3 factor(season)2
1 ( 1 ) "*" " " " " " " " " " " " " " " " " " " " " " "
2 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " " "
3 ( 1 ) "*" "*" "*" " " " " " " " " " " " " " " " " " "
4 ( 1 ) "*" "*" "*" " " " " " " " " " " "*" " " " " " "
5 ( 1 ) "*" "*" "*" " " " " " " " " " " "*" " " " " "*"
6 ( 1 ) "*" "*" "*" " " " " " " " " " " "*" " " " " "*"
7 ( 1 ) "*" "*" "*" " " " " " " " " " " "*" " " " " "*"
8 ( 1 ) "*" "*" "*" "*" " " " " " " " " "*" " " " " "*"
9 ( 1 ) "*" "*" "*" "*" " " " " " " " " "*" " " "*" "*"
10 ( 1 ) "*" "*" "*" "*" " " "*" " " " " "*" " " "*" "*"
11 ( 1 ) "*" "*" "*" "*" " " "*" " " " " "*" " " "*" "*"
12 ( 1 ) "*" "*" "*" "*" " " "*" "*" " " "*" " " "*" "*"
factor(season)3 factor(season)4 factor(weather_main)2 factor(weather_main)3 factor(weather_main)5 factor(weather_main)6 factor(weather_main)7
1 ( 1 ) " " " " " " " " " " " " " "
2 ( 1 ) " " " " " " " " " " " " " "
3 ( 1 ) " " " " " " " " " " " " " "
4 ( 1 ) " " " " " " " " " " " " " "
5 ( 1 ) " " " " " " " " " " " " " "
6 ( 1 ) " " "*" " " " " " " " " " "
7 ( 1 ) " " "*" " " " " " " " " " "
8 ( 1 ) " " "*" " " " " " " " " " "
9 ( 1 ) " " "*" " " " " " " " " " "
10 ( 1 ) " " "*" " " " " " " " " " "
11 ( 1 ) " " "*" "*" " " " " " " " "
12 ( 1 ) " " "*" "*" " " " " " " " "
factor(weather_main)8 factor(weather_main)10 factor(weather_main)12
1 ( 1 ) " " " " " "
2 ( 1 ) " " " " " "
3 ( 1 ) " " " " " "
4 ( 1 ) " " " " " "
5 ( 1 ) " " " " " "
6 ( 1 ) " " " " " "
7 ( 1 ) "*" " " " "
8 ( 1 ) "*" " " " "
9 ( 1 ) "*" " " " "
10 ( 1 ) "*" " " " "
11 ( 1 ) "*" " " " "
12 ( 1 ) "*" " " " "
# Model implied by the CV-selected subset (drops pressure); compare vs p-value model
workModelMadridCross <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workMadrid)
anova( workModelMadridCross, workModelMadridPvalue)
Analysis of Variance Table
Model 1: energy_demand ~ E_1 + E_25 + temp + humidity + wind_speed + rain_duration +
factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Model 2: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
rain_duration + factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 27742 309133337
2 27741 308998752 1 134585 12.083 0.0005097 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Fit summary of the CV-selected Madrid model
summary(workModelMadridCross)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + wind_speed +
rain_duration + factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main), data = workMadrid)
Residuals:
Min 1Q Median 3Q Max
-1097.70 -56.04 8.94 63.55 915.46
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -957.033754 45.343201 -21.106 < 0.0000000000000002 ***
E_1 0.892368 0.002463 362.251 < 0.0000000000000002 ***
E_25 0.102965 0.002424 42.480 < 0.0000000000000002 ***
temp 3.389425 0.150143 22.575 < 0.0000000000000002 ***
humidity -0.583777 0.048730 -11.980 < 0.0000000000000002 ***
wind_speed 2.360256 0.353585 6.675 0.0000000000252 ***
rain_duration 14.273068 4.606340 3.099 0.001947 **
factor(day_night)2 42.044434 1.669693 25.181 < 0.0000000000000002 ***
factor(time_band)2 2.621027 8.642947 0.303 0.761697
factor(time_band)3 68.228323 6.715646 10.160 < 0.0000000000000002 ***
factor(season)2 -48.456952 2.296242 -21.103 < 0.0000000000000002 ***
factor(season)3 -6.874955 1.866034 -3.684 0.000230 ***
factor(season)4 26.410474 2.062374 12.806 < 0.0000000000000002 ***
factor(weather_main)2 8.205598 1.565647 5.241 0.0000001608532 ***
factor(weather_main)3 14.948023 6.280446 2.380 0.017315 *
factor(weather_main)5 6.066888 5.665773 1.071 0.284270
factor(weather_main)6 -7.420466 27.317031 -0.272 0.785899
factor(weather_main)7 19.151012 5.055654 3.788 0.000152 ***
factor(weather_main)8 25.713836 3.917169 6.564 0.0000000000532 ***
factor(weather_main)10 40.421348 18.762968 2.154 0.031224 *
factor(weather_main)12 33.431946 9.415854 3.551 0.000385 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 105.6 on 27742 degrees of freedom
Multiple R-squared: 0.9161, Adjusted R-squared: 0.916
F-statistic: 1.514e+04 on 20 and 27742 DF, p-value: < 0.00000000000000022
# Exhaustive all-subsets search (olsrr); left commented out because of long
# run time — it selected the same reduced model as the p-value approach.
#ModelOLS <- ols_step_all_possible(workModelMadridFull)
#cat("\n")
#plot(x = ModelOLS$n, y = ModelOLS$adjr)
#plot(x = ModelOLS$n, y = ModelOLS$aic)
#OLSsummary <- ModelOLS %>% group_by(n) %>% filter(adjr == max(adjr))
#OLSsummary
Same reduced model as P-Value approach (thus commenting out given processing time)
Model adjusted R-square from above methods (to four significant digits)
Full: 0.916
P-value: 0.916
Bestfits: 0.9154
AIC Method: 0.916
Cross-validation: 0.916
OLS-step: 0.916
Final model is the workModelMadridPvalue with highest Adjusted R-square and fewest predictors.
# Summary comparison of the Madrid model-selection methods: number of
# predictor variables, number of fitted coefficients, and adjusted R^2.
MadridTable <- data.frame(
  Method       = c("Full", "P-value", "BestFits", "AIC Forward", "Cross Validation"),
  Variables    = c(13, 12, 8, 12, 11),
  Coefficients = c(23, 22, 11, 22, 21),
  Adjusted_R2  = c(0.9160325, 0.9160309, 0.9153598, 0.9160309, 0.9159974)
)
# Keep the standalone vectors in scope, as downstream code may reference them
Method       <- MadridTable$Method
Variables    <- MadridTable$Variables
Coefficients <- MadridTable$Coefficients
Adjusted_R2  <- MadridTable$Adjusted_R2
# Final Madrid model: the p-value-reduced model (highest adj. R^2, fewest terms)
workFinal <- workModelMadridPvalue
par(mfrow=c(2,2))
#Residual Plot: studentized residuals vs fitted; points with |residual| > 3 drawn in red
plot(x=fitted(workFinal), y=studres(workFinal), xlab = "Fitted Values",
ylab = "Studentized Residuals", main='Residuals Plot', col = ifelse(studres(workFinal) < -3,"red",ifelse(studres(workFinal) > 3,"red","black")))
abline(h=-3, col="red", lty=2)
abline(h=3, col="red", lty=2)
#norm test: Anderson-Darling on the standardized residuals (nortest::ad.test)
std.res <- rstandard(workFinal)
ad.test(std.res)
Anderson-Darling normality test
data: std.res
A = 147.61, p-value < 0.00000000000000022
# Q-Q plot of the studentized residuals; the title records the AD-test p-value above
qqnorm(studres(workFinal), pch = 1, frame = FALSE, main=expression("AD test =2.2*10"^-16))
qqline(studres(workFinal), col = "steelblue", lwd = 2)
#leverage: cut-off is the usual 2p/n rule (p = number of model coefficients)
lev<-hatvalues(workFinal)
cutlev = (2*length(coefficients(workFinal)))/nrow(workMadrid)
# Count and assess the number of leverage values > cut-off
potoutlier <- sum(lev > cutlev, na.rm=TRUE)
totcount = length(fitted(workFinal))
print(paste("The number of leverage points that are potential outliers is", potoutlier,". This is", round(100*potoutlier/totcount,2),"% of the total number of predicted values, (", totcount,") thus not material."))
[1] "The number of leverage points that are potential outliers is 1876 . This is 6.76 % of the total number of predicted values, ( 27763 ) thus not material."
#barplot(lev, ylim = c(0, 2*cutlev))
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for the working set")
abline(h = cutlev, col = "red", lty=2)
#cook distance: flag influential observations (rule of thumb: D > 1)
cookdist<-cooks.distance(workFinal)
#barplot(cookdist, ylim=c(0,1.01), main = "Cook's Distance plot")
#abline(h = 1, col = "red")
# FIX: the reference-line colour is now set OUTSIDE aes(). aes(color = "red")
# maps the constant string "red" to the colour scale (the line is drawn in the
# default palette hue and a legend had to be suppressed via guides()); setting
# colour as a parameter draws a literal red line.
ggplot(as.data.frame(cookdist), aes(x = 1:nrow(as.data.frame(cookdist)), y = cookdist)) +
geom_line() +
geom_hline(yintercept = 1, color = "red") +
scale_y_continuous(limits = c(0, 1.1)) +
# Broken y-axis so the near-zero distances and the D = 1 cut-off both show
scale_y_break(breaks = c(0.01, 0.9), scales = 0.1) +
labs(title = "Cook's Distance",
x = "Index", y = "")
# Opening the graphical device
# Customizing the output
pdf("MadridAssumptionCheck1.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
# par(mfrow=c(2,2))
#residuals scatter plot (report copy of the on-screen residuals plot above)
plot(x=fitted(workFinal), y=studres(workFinal), xlab = "Fitted Values",
ylab = "Studentized Residuals", main='Residuals Plot', col = ifelse(studres(workFinal) < -3,"red",ifelse(studres(workFinal) > 3,"red","black")))
abline(h=-3, col="red", lty=2)
abline(h=3, col="red", lty=2)
grid(10,10)
dev.off()
null device
1
#normality plot (report copy of the Q-Q plot)
pdf("MadridAssumptionCheck2.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
qqnorm(studres(workFinal), pch = 1, frame = FALSE, main=expression("AD test =2.2*10"^-16))
qqline(studres(workFinal), col = "steelblue", lwd = 2)
grid(10,10)
dev.off()
null device
1
#leverage plot (report copy)
pdf("MadridAssumptionCheck3.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
# FIX: ylab previously said "Barcelona" (copy-paste from another city's
# section) — this figure shows the Madrid working set.
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for Madrid working set")
abline(h = cutlev, col = "red", lty=2)
grid(10,10)
dev.off()
null device
1
#Cook's distance plot (report copy)
pdf("MadridAssumptionCheck4.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
# FIX 1: a ggplot object is only auto-printed at the console; when this script
# is source()d the bare expression renders nothing and the PDF comes out blank,
# so print() it explicitly.
# FIX 2: set the reference-line colour outside aes() — aes(color = "red") maps
# a constant (drawn in the default palette hue), it does not draw a red line.
print(
ggplot(as.data.frame(cookdist), aes(x = 1:nrow(as.data.frame(cookdist)), y = cookdist)) +
geom_line() +
geom_hline(yintercept = 1, color = "red") +
scale_y_continuous(limits = c(0, 1.1)) +
scale_y_break(breaks = c(0.005, 0.9), scales = 0.1) +
labs(title = "Cook's Distance",
x = "Index", y = "")
)
dev.off()
null device
1
# Score the final Madrid model on the held-out evaluation set.
# NOTE(review): columns 2 and 14 are dropped by position — presumably the
# response and an identifier column; confirm against the evalMadrid layout.
newdata <- evalMadrid[ c(-2,-14)]
predict.eval <- predict(workFinal, newdata)
# Predicted vs actual demand; the blue 45-degree line marks perfect agreement
plot(x = evalMadrid$energy_demand, y=predict.eval, xlab="Actual energy demand for the evaluation set",
ylab="Predicted energy demand for the evaluation set", main = "Madrid")
abline(a=0, b=1, col="blue")
grid(10,10)
Prepare report figure
# Opening the graphical device
# Customizing the output
pdf("MadridEvalActComp.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
par(mfrow=c(1,1))
# Report copy of the predicted-vs-actual scatter above
plot(x = evalMadrid$energy_demand, y=predict.eval, xlab="Actual energy demand for the evaluation set",
ylab="Predicted energy demand for the evaluation set", main = "Madrid", cex.lab = 0.75)
abline(a=0, b=1, col="blue")
grid(10,10)
# Closing the graphical device
dev.off()
null device
1
# Root-mean-square error of predicted vs actual demand on the evaluation set.
# FIX: rootmse was computed from evalMadrid and then immediately overwritten
# by a duplicate line using `evaldata`; compute it once from evalMadrid and
# reference that frame consistently in the message.
rootmse <- rmse(evalMadrid$energy_demand, predict.eval)
print(paste("The root mean square error of the predicted vs actual",nrow(evalMadrid),"hourly energy_data points for Madrid over the 2015-2018 timespan is an energy demand difference of",round(rootmse,2),"MWh."))
[1] "The root mean square error of the predicted vs actual 6941 hourly energy_data points for Madrid over the 2015-2018 timespan is an energy demand difference of 107.04 MWh."
cat("\n")
demandcompdf <- data.frame("Actual (MWh)" = evalMadrid$energy_demand,"Predicted (MWh)" = predict.eval)
head(demandcompdf,4L)
cat("\n")
print(paste("For example, the first actual demand value is",round(evaldata$energy_demand[1],2),"MWh and the corresponding predicted value is",round(predict.eval[1],2),"MWh. The percent error between this predicted value relative to the actual value is",round((100*(predict.eval[1] - evaldata$energy_demand[1])/evaldata$energy_demand[1]),2),"%."))
[1] "For example, the first actual demand value is 1920.23 MWh and the corresponding predicted value is 2030.71 MWh. The percent error between this predicted value relative to the actual value is 5.75 %."
cat("\n")
diffpct <- c((abs(evaldata$energy_demand - predict.eval))/evaldata$energy_demand)
print(paste("The mean difference between the",nrow(evaldata),"evaluation data set actual and model predicted values is",round(100*mean(diffpct),2),"%."))
[1] "The mean difference between the 6941 evaluation data set actual and model predicted values is 3.57 %."
Obtain X matrix from the model
# Coefficient (predictor) names of the final working model.
# FIX: dropped the redundant `[,]` — the coefficients component is already the
# full matrix, and rownames() applies to it directly.
Predictors <- rownames(summary(workFinal)$coefficients)
workpredictors <- as.data.frame(Predictors)
# Design matrix of the working model; used below for evaluation-set leverage
X <- model.matrix(workFinal)
class(X)
[1] "matrix" "array"
cat("\n")
dim(X)
[1] 27763 22
cat("\n")
head(X, 4L)
(Intercept) E_1 E_25 temp humidity pressure wind_speed rain_duration factor(day_night)2 factor(time_band)2 factor(time_band)3
1 1 1920.228 2132.596 281.5740 61 974 1 0 0 0 1
2 1 1963.115 2018.017 281.5740 61 974 1 0 0 0 1
3 1 1872.089 1970.913 282.5377 47 1036 1 0 0 1 0
4 1 1839.466 1949.509 283.3083 45 1035 1 0 0 1 0
factor(season)2 factor(season)3 factor(season)4 factor(weather_main)2 factor(weather_main)3 factor(weather_main)5 factor(weather_main)6
1 0 0 1 0 0 0 0
2 0 0 1 0 0 0 0
3 0 0 1 0 0 0 0
4 0 0 1 0 0 0 0
factor(weather_main)7 factor(weather_main)8 factor(weather_main)10 factor(weather_main)12
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
Create x_new matrix using the validation data. Note that the purpose of running the evaluation regression here is to extract all of the correct predictor coefficients.
# Fit the final-model formula on the evaluation data. This regression exists
# only so model.matrix() yields an evaluation-set design matrix with the same
# coefficient columns as the working model.
evalFinal <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
rain_duration + factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main), data = evalMadrid)
# FIX: dropped the redundant `[,]` on the coefficients matrix
Predictors <- rownames(summary(evalFinal)$coefficients)
evalpredictors <- as.data.frame(Predictors)
x_new1 <- model.matrix(evalFinal)
cat("\n")
dim(x_new1)
[1] 6941 22
cat("\n")
#Check if there's a predictor difference between the work and eval matrices
workevalcompare <-anti_join(workpredictors,evalpredictors)
Joining, by = "Predictors"
cat("\n")
#If workevalcompare is 'No data available in table', run the next line of code and skip to line 816
x_new <- x_new1
#Otherwise when workevalcompare is not 'No data available in table', run the following lines of code after identifying which column is missing between X and x_new1 (no discrepancy was found between workMadrid but not in evalMadrid).
#x_new_ <-as.data.frame(cbind(x_new1,rep(0,nrow(x_new1))))
#dummy_xnew<- x_new_ %>% relocate(V21,.before = 16) %>% rename("factor(weather_main)4" = V21)
#x_new <- as.matrix(dummy_xnew)
cat("\n")
class(x_new)
[1] "matrix" "array"
cat("\n")
dim(x_new)
[1] 6941 22
cat("\n")
head(x_new, 4L)
(Intercept) E_1 E_25 temp humidity pressure wind_speed rain_duration factor(day_night)2 factor(time_band)2 factor(time_band)3
1 1 1966.377 2101.883 281.5740 61 974 1 0 0 0 1
2 1 2229.511 2362.708 277.4590 72 1036 1 0 1 0 1
3 1 2308.602 2377.031 274.9512 55 999 1 0 1 0 1
4 1 2311.068 2286.880 282.5917 57 1037 1 0 0 1 0
factor(season)2 factor(season)3 factor(season)4 factor(weather_main)2 factor(weather_main)3 factor(weather_main)5 factor(weather_main)6
1 0 0 1 0 0 0 0
2 0 0 1 0 0 0 0
3 0 0 1 0 0 0 0
4 0 0 1 0 0 0 0
factor(weather_main)7 factor(weather_main)8 factor(weather_main)10 factor(weather_main)12
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
Compute the hat values for the validation data
# Leverage of the evaluation rows with respect to the WORKING design matrix:
# h_new = x_new (X'X)^{-1} x_new'
h_new_mid <- solve(t(X)%*%X)
dim(h_new_mid)
[1] 22 22
# NOTE(review): this materializes a full 6941 x 6941 matrix only to read its
# diagonal below; rowSums((x_new %*% h_new_mid) * x_new) would compute the
# diagonal directly with far less memory.
h_new <- x_new%*%h_new_mid%*%t(x_new)
dim(h_new)
[1] 6941 6941
Plot the hatvalues for the data in the evaluation set
#cutlev2 = (2*length(coefficients(evalFinal)))/nrow(evalMadrid)
# Count and assess the number of leverage values > cut-off (2p/n from the working set)
potoutlier2 <- sum(diag(h_new) > cutlev, na.rm=TRUE)
totcount2 = nrow(evalMadrid)
plot(diag(h_new), type = "h", ylab = "Leverage for the validation data set", ylim = c(0,2*cutlev), main = "Madrid")
abline(h=cutlev, col="red", lty=2)
print(paste("The number of leverage points that are potential outliers is", potoutlier2,". This is", round(100*potoutlier2/totcount2,2),"% of the total number of predicted values, (", totcount2,") thus not material."))
[1] "The number of leverage points that are potential outliers is 438 . This is 6.31 % of the total number of predicted values, ( 6941 ) thus not material."
Side-by-side comparison of working-set and evaluation-set leverage
# Working-set vs evaluation-set leverage, plotted side by side
par(mfrow=c(1,2))
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for the working data set", main = "Madrid")
abline(h = cutlev, col = "red", lty=2)
plot(diag(h_new), type = "h", ylim = c(0,2*cutlev), ylab = "Leverage for the evaluation data set", main = "Madrid")
abline(h=cutlev, col="red", lty=2)
print(paste("The leverage cutoff exceedences in the evaluation set is",round(100*potoutlier2/totcount2,2),"% of the total number of observed values (", totcount2,") and is comparable to that of the working set,", round(100*potoutlier/totcount,2),"% of the total number of predicted values (", totcount,")."))
[1] "The leverage cutoff exceedences in the evaluation set is 6.31 % of the total number of observed values ( 6941 ) and is comparable to that of the working set, 6.76 % of the total number of predicted values ( 27763 )."
Prepare report figure
# Opening the graphical device
# Customizing the output
pdf("MadridLevComp.pdf", # File name
width = 14, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
par(mfrow=c(1,2))
# FIX: removed the invalid `res = 600` argument — `res` is a bitmap-device
# (png/jpeg) argument, not a plot()/par() graphical parameter; passing it to
# plot() only raises a "not a graphical parameter" warning.
plot(lev, type="h", ylab = "Leverage", ylim = c(0,2*cutlev), yaxt = 'n', main = "Madrid Working Data")
abline(h = cutlev, col = "red", lty=2)
axis(side = 2, at=seq(0,2*round(cutlev,5),by=round(cutlev,5)))
plot(diag(h_new), type = "h", ylab = "", ylim = c(0,2*cutlev), yaxt = 'n', main = "Madrid Evaluation Data")
abline(h=cutlev, col="red", lty=2)
axis(side = 2, at=seq(0,2*round(cutlev,5),by=round(cutlev,5)))
# Closing the graphical device
dev.off()
null device
1
# Seville: load the pre-split evaluation and working data sets.
# NOTE(review): these paths use lowercase 'data/' while the top of the script
# reads from 'Data/' — verify on a case-sensitive file system.
evaldata<-read.csv('data/evaluation_Seville.csv')
workdata<-read.csv('data/working_Seville.csv')
#remove index, time_ID and snow_duration columns from "eval" analysis dataframe
evalSeville<-evaldata[c(-1,-2,-11)]
#remove index, time_ID and snow_duration columns from "work" analysis dataframe
workSeville<-workdata[c(-1,-2,-11)]
head(workSeville, 4L)
# Correlation matrix for numeric predictors only (categorical predictors excluded)
workcor<-workSeville[c(-9:-12) ]
ggpairs(workcor)
cor(workSeville)
E_1 E_2 E_25 temp humidity pressure wind_speed rain_duration day_night time_band season
E_1 1.000000000 0.950854151 0.66638363 0.16537305 -0.23468751 -0.02311708 0.09616216 0.007157484 -0.542317356 0.111533226 0.08079501
E_2 0.950854151 1.000000000 0.57019030 0.11747631 -0.17392319 -0.02373721 0.04825869 0.007868531 -0.534651817 0.091257807 0.08074571
E_25 0.666383630 0.570190303 1.00000000 0.20343503 -0.28989939 -0.03106706 0.14126341 0.017526469 -0.510520679 0.112851400 0.07745398
temp 0.165373052 0.117476309 0.20343503 1.00000000 -0.73635552 -0.36378957 0.13787943 -0.099193770 -0.366474926 -0.147607803 -0.36184280
humidity -0.234687511 -0.173923187 -0.28989939 -0.73635552 1.00000000 0.17368694 -0.20591888 0.190535534 0.390079509 0.085354419 0.25243795
pressure -0.023117084 -0.023737213 -0.03106706 -0.36378957 0.17368694 1.00000000 -0.20855776 -0.187508867 0.068019733 0.188780617 0.41474625
wind_speed 0.096162157 0.048258690 0.14126341 0.13787943 -0.20591888 -0.20855776 1.00000000 0.121015082 -0.175701943 0.056228332 -0.08705692
rain_duration 0.007157484 0.007868531 0.01752647 -0.09919377 0.19053553 -0.18750887 0.12101508 1.000000000 -0.009750599 0.000193738 -0.04631036
day_night -0.542317356 -0.534651817 -0.51052068 -0.36647493 0.39007951 0.06801973 -0.17570194 -0.009750599 1.000000000 0.001903622 0.15151639
time_band 0.111533226 0.091257807 0.11285140 -0.14760780 0.08535442 0.18878062 0.05622833 0.000193738 0.001903622 1.000000000 0.16386623
season 0.080795009 0.080745711 0.07745398 -0.36184280 0.25243795 0.41474625 -0.08705692 -0.046310357 0.151516388 0.163866226 1.00000000
weather_main -0.004452486 0.011227066 -0.02136187 -0.19715308 0.35226130 -0.16600812 0.09555784 0.572527418 0.025237816 0.006401385 0.02450594
energy_demand 0.950980801 0.832402107 0.69960726 0.20376123 -0.28491923 -0.02150207 0.13592024 0.006346723 -0.509531109 0.123373405 0.08048677
weather_main energy_demand
E_1 -0.004452486 0.950980801
E_2 0.011227066 0.832402107
E_25 -0.021361874 0.699607258
temp -0.197153079 0.203761230
humidity 0.352261299 -0.284919235
pressure -0.166008124 -0.021502073
wind_speed 0.095557837 0.135920236
rain_duration 0.572527418 0.006346723
day_night 0.025237816 -0.509531109
time_band 0.006401385 0.123373405
season 0.024505939 0.080486767
weather_main 1.000000000 -0.022175596
energy_demand -0.022175596 1.000000000
# Correlation heat map of the numeric predictors
corrplot(cor(workcor), method="color", type="full", addCoef.col = "red", tl.col="black",number.cex = 0.75)
# Candidate full model including both E_1 and E_2, to gauge their collinearity via VIF
workModelSeville1 <- lm(energy_demand ~ E_1 + E_2 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workSeville)
vif(workModelSeville1)
GVIF Df GVIF^(1/(2*Df))
E_1 14.477427 1 3.804921
E_2 12.448458 1 3.528237
E_25 2.162420 1 1.470517
temp 3.784144 1 1.945288
humidity 2.931415 1 1.712138
pressure 1.604205 1 1.266572
wind_speed 1.210197 1 1.100089
rain_duration 2.059981 1 1.435263
factor(day_night) 1.928208 1 1.388599
factor(time_band) 1.116888 2 1.028022
factor(season) 2.801229 3 1.187294
factor(weather_main) 2.926601 10 1.055160
# Full model with E_2 dropped (its GVIF above exceeds 10, indicating collinearity with E_1)
workModelSevilleFull <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workSeville)
vif(workModelSevilleFull)
GVIF Df GVIF^(1/(2*Df))
E_1 2.048562 1 1.431280
E_25 1.994263 1 1.412184
temp 3.726333 1 1.930371
humidity 2.924283 1 1.710053
pressure 1.596200 1 1.263408
wind_speed 1.195560 1 1.093417
rain_duration 2.059975 1 1.435261
factor(day_night) 1.833637 1 1.354119
factor(time_band) 1.113687 2 1.027285
factor(season) 2.762347 3 1.184531
factor(weather_main) 2.912232 10 1.054900
# to obtain residuals
res.fullmodel1 <- residuals(workModelSevilleFull)
# to obtain standardized residuals
std.res.fullmodel1 <- rstandard(workModelSevilleFull)
# to obtain fitted/predicted values
pred.fullmodel1 <- fitted.values(workModelSevilleFull)
par(mfrow=c(1,1))
# Normal Q-Q plot of the standardized residuals
qqnorm(y = std.res.fullmodel1, main = " Normal Q-Q Plot ",
xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
qqline(y = std.res.fullmodel1)
#residual plot: standardized residuals vs fitted; |residual| > 3 flagged in red
resplotdata1 <- data.frame(std.res.fullmodel1, pred.fullmodel1)
# Best-fit line through the residuals (should be approximately flat if mean-zero)
resbf1 <- lm(std.res.fullmodel1 ~ pred.fullmodel1, data = resplotdata1)
plot(x = pred.fullmodel1, y = std.res.fullmodel1, ylab = "Standardized Residuals", xlab = "Predicted Values", main = "Residuals Plot", col = ifelse(std.res.fullmodel1 < -3,"red",ifelse(std.res.fullmodel1 > 3,"red","black")))
abline(h = 0, col="blue", lty=1)
abline(resbf1, col="red", lty=3)
abline(h = 3, col="green", lty=3)
abline(h=-3, col="green", lty=3)
legend("bottomleft", legend=c("Best fit line of standardized residuals", "Horizontal line y = 0.0", "Horizontal line, y = +/- 3"), fill = c("red","blue","green"), cex = 0.5)
summary(workModelSevilleFull)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + pressure +
wind_speed + rain_duration + factor(day_night) + factor(time_band) +
factor(season) + factor(weather_main), data = workSeville)
Residuals:
Min 1Q Median 3Q Max
-549.95 -30.21 4.87 33.75 478.29
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -932.840821 68.808830 -13.557 < 0.0000000000000002 ***
E_1 0.890426 0.002422 367.692 < 0.0000000000000002 ***
E_25 0.111392 0.002398 46.461 < 0.0000000000000002 ***
temp 1.515006 0.077855 19.459 < 0.0000000000000002 ***
humidity -0.355891 0.024362 -14.608 < 0.0000000000000002 ***
pressure 0.483766 0.060212 8.034 0.000000000000000977 ***
wind_speed 2.987490 0.190422 15.689 < 0.0000000000000002 ***
rain_duration 3.732190 2.169563 1.720 0.085398 .
factor(day_night)2 26.545740 0.882968 30.064 < 0.0000000000000002 ***
factor(time_band)2 -2.706043 4.383971 -0.617 0.537069
factor(time_band)3 25.388655 3.414551 7.435 0.000000000000107216 ***
factor(season)2 -17.569908 1.092649 -16.080 < 0.0000000000000002 ***
factor(season)3 -5.339033 0.962640 -5.546 0.000000029451488123 ***
factor(season)4 6.990646 1.115924 6.264 0.000000000379592466 ***
factor(weather_main)2 2.242919 0.892504 2.513 0.011974 *
factor(weather_main)3 -12.318180 4.275098 -2.881 0.003962 **
factor(weather_main)4 -14.457822 3.478341 -4.157 0.000032408148010203 ***
factor(weather_main)5 -15.316699 3.093864 -4.951 0.000000743884166897 ***
factor(weather_main)6 -35.864988 3.428151 -10.462 < 0.0000000000000002 ***
factor(weather_main)7 -7.805466 2.448502 -3.188 0.001435 **
factor(weather_main)8 4.944984 2.054759 2.407 0.016108 *
factor(weather_main)9 41.459354 11.394392 3.639 0.000275 ***
factor(weather_main)11 26.081350 54.527770 0.478 0.632431
factor(weather_main)12 7.346205 5.619242 1.307 0.191111
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 54.52 on 27976 degrees of freedom
Multiple R-squared: 0.92, Adjusted R-squared: 0.9199
F-statistic: 1.399e+04 on 23 and 27976 DF, p-value: < 0.00000000000000022
# Tidy the full Seville model into one row per coefficient
df <- workModelSevilleFull %>%
tidy()
# Terms whose p-value exceeds 0.05 (candidates for removal)
# FIX: reference the column directly inside filter() — `df$p.value` bypasses
# dplyr's data masking and silently breaks if df is grouped or re-ordered
# earlier in the pipeline.
df %>%
filter(p.value > 0.05)
# Reduced model: rain_duration dropped (p = 0.085 in the full model above)
workModelSevillePvalue <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workSeville)
summary(workModelSevillePvalue)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + pressure +
wind_speed + factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main), data = workSeville)
Residuals:
Min 1Q Median 3Q Max
-549.95 -30.16 4.88 33.74 478.28
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -926.766568 68.720577 -13.486 < 0.0000000000000002 ***
E_1 0.890376 0.002422 367.685 < 0.0000000000000002 ***
E_25 0.111475 0.002397 46.503 < 0.0000000000000002 ***
temp 1.513449 0.077853 19.440 < 0.0000000000000002 ***
humidity -0.354561 0.024351 -14.561 < 0.0000000000000002 ***
pressure 0.478155 0.060126 7.953 0.00000000000000189 ***
wind_speed 2.994190 0.190388 15.727 < 0.0000000000000002 ***
factor(day_night)2 26.504162 0.882668 30.027 < 0.0000000000000002 ***
factor(time_band)2 -2.815274 4.383665 -0.642 0.520736
factor(time_band)3 25.368138 3.414650 7.429 0.00000000000011237 ***
factor(season)2 -17.582847 1.092661 -16.092 < 0.0000000000000002 ***
factor(season)3 -5.340872 0.962673 -5.548 0.00000002916363891 ***
factor(season)4 6.970419 1.115901 6.246 0.00000000042592646 ***
factor(weather_main)2 2.210683 0.892339 2.477 0.013240 *
factor(weather_main)3 -11.990918 4.271012 -2.808 0.004996 **
factor(weather_main)4 -14.479694 3.478440 -4.163 0.00003154496399759 ***
factor(weather_main)5 -15.332309 3.093959 -4.956 0.00000072541913847 ***
factor(weather_main)6 -35.873529 3.428268 -10.464 < 0.0000000000000002 ***
factor(weather_main)7 -7.783363 2.448554 -3.179 0.001481 **
factor(weather_main)8 7.245024 1.560299 4.643 0.00000344352885666 ***
factor(weather_main)9 41.439988 11.394785 3.637 0.000277 ***
factor(weather_main)11 26.078549 54.529679 0.478 0.632480
factor(weather_main)12 8.050709 5.604495 1.436 0.150879
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 54.52 on 27977 degrees of freedom
Multiple R-squared: 0.92, Adjusted R-squared: 0.9199
F-statistic: 1.462e+04 on 22 and 27977 DF, p-value: < 0.00000000000000022
# Partial F-test: dropping rain_duration does not significantly worsen fit (p = 0.085 below)
anova(workModelSevillePvalue, workModelSevilleFull)
Analysis of Variance Table
Model 1: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Model 2: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
rain_duration + factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 27977 83169196
2 27976 83160400 1 8796.6 2.9593 0.0854 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Best-subsets search (one best model per subset size) scored by adjusted R^2
bestfits <- regsubsets(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workSeville, nbest = 1)
plot(bestfits, scale="adjr2")
# Refit the selected best-subsets model as an lm for comparison with the other methods
workModelSevilleBestfit<-lm(energy_demand ~ E_1 + E_25 + temp + humidity + wind_speed + factor(day_night) + factor(season) , data = workSeville)
summary(workModelSevilleBestfit)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + wind_speed +
factor(day_night) + factor(season), data = workSeville)
Residuals:
Min 1Q Median 3Q Max
-548.84 -30.38 5.07 33.74 477.84
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -393.271898 23.487782 -16.744 < 0.0000000000000002 ***
E_1 0.890138 0.002426 366.882 < 0.0000000000000002 ***
E_25 0.113507 0.002400 47.294 < 0.0000000000000002 ***
temp 1.352323 0.076360 17.710 < 0.0000000000000002 ***
humidity -0.377995 0.022132 -17.079 < 0.0000000000000002 ***
wind_speed 3.012031 0.182150 16.536 < 0.0000000000000002 ***
factor(day_night)2 25.972373 0.870245 29.845 < 0.0000000000000002 ***
factor(season)2 -18.083837 1.077474 -16.784 < 0.0000000000000002 ***
factor(season)3 -4.877645 0.960503 -5.078 0.000000383 ***
factor(season)4 9.565773 1.029338 9.293 < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 54.85 on 27990 degrees of freedom
Multiple R-squared: 0.919, Adjusted R-squared: 0.9189
F-statistic: 3.527e+04 on 9 and 27990 DF, p-value: < 0.00000000000000022
# Partial F-test: the p-value model's extra terms are jointly significant vs the best-subsets model
anova(workModelSevilleBestfit, workModelSevillePvalue)
Analysis of Variance Table
Model 1: energy_demand ~ E_1 + E_25 + temp + humidity + wind_speed + factor(day_night) +
factor(season)
Model 2: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 27990 84221007
2 27977 83169196 13 1051811 27.216 < 0.00000000000000022 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Forward stepwise selection by AIC, from the intercept-only model up to the full model
workNullModel<- lm(energy_demand ~ 1, data = workSeville)
step.working <- stepAIC(workNullModel, scope = list(lower = workNullModel,
upper = workModelSevilleFull), direction = "forward", trace=FALSE)
summary(step.working)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + humidity + factor(day_night) +
wind_speed + factor(season) + temp + factor(weather_main) +
pressure + factor(time_band) + rain_duration, data = workSeville)
Residuals:
Min 1Q Median 3Q Max
-549.95 -30.21 4.87 33.75 478.29
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -932.840821 68.808830 -13.557 < 0.0000000000000002 ***
E_1 0.890426 0.002422 367.692 < 0.0000000000000002 ***
E_25 0.111392 0.002398 46.461 < 0.0000000000000002 ***
humidity -0.355891 0.024362 -14.608 < 0.0000000000000002 ***
factor(day_night)2 26.545740 0.882968 30.064 < 0.0000000000000002 ***
wind_speed 2.987490 0.190422 15.689 < 0.0000000000000002 ***
factor(season)2 -17.569908 1.092649 -16.080 < 0.0000000000000002 ***
factor(season)3 -5.339033 0.962640 -5.546 0.000000029451488123 ***
factor(season)4 6.990646 1.115924 6.264 0.000000000379592466 ***
temp 1.515006 0.077855 19.459 < 0.0000000000000002 ***
factor(weather_main)2 2.242919 0.892504 2.513 0.011974 *
factor(weather_main)3 -12.318180 4.275098 -2.881 0.003962 **
factor(weather_main)4 -14.457822 3.478341 -4.157 0.000032408148010197 ***
factor(weather_main)5 -15.316699 3.093864 -4.951 0.000000743884166896 ***
factor(weather_main)6 -35.864988 3.428151 -10.462 < 0.0000000000000002 ***
factor(weather_main)7 -7.805466 2.448502 -3.188 0.001435 **
factor(weather_main)8 4.944984 2.054759 2.407 0.016108 *
factor(weather_main)9 41.459354 11.394392 3.639 0.000275 ***
factor(weather_main)11 26.081350 54.527770 0.478 0.632431
factor(weather_main)12 7.346205 5.619242 1.307 0.191111
pressure 0.483766 0.060212 8.034 0.000000000000000977 ***
factor(time_band)2 -2.706043 4.383971 -0.617 0.537069
factor(time_band)3 25.388655 3.414551 7.435 0.000000000000107216 ***
rain_duration 3.732190 2.169563 1.720 0.085398 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 54.52 on 27976 degrees of freedom
Multiple R-squared: 0.92, Adjusted R-squared: 0.9199
F-statistic: 1.399e+04 on 23 and 27976 DF, p-value: < 0.00000000000000022
# Set seed for reproducibility in CV
set.seed(123)
# Set up repeated k-fold cross-validation, with k=10
train.control <- trainControl(method = "cv", number = 10)
# Train the model: backward subset selection (caret + leaps), subset sizes 1..11,
# each size scored by 10-fold cross-validation
step.cv.work <- train(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workSeville, method = "leapBackward",
tuneGrid = data.frame(nvmax = 1:11),
trControl = train.control)
Reordering variables and trying again:
# CV metrics per subset size, then the variables retained by the chosen model
step.cv.work$results
summary(step.cv.work$finalModel)
Subset selection object
23 Variables (and intercept)
Forced in Forced out
E_1 FALSE FALSE
E_25 FALSE FALSE
temp FALSE FALSE
humidity FALSE FALSE
pressure FALSE FALSE
wind_speed FALSE FALSE
rain_duration FALSE FALSE
factor(day_night)2 FALSE FALSE
factor(time_band)2 FALSE FALSE
factor(time_band)3 FALSE FALSE
factor(season)2 FALSE FALSE
factor(season)3 FALSE FALSE
factor(season)4 FALSE FALSE
factor(weather_main)2 FALSE FALSE
factor(weather_main)3 FALSE FALSE
factor(weather_main)4 FALSE FALSE
factor(weather_main)5 FALSE FALSE
factor(weather_main)6 FALSE FALSE
factor(weather_main)7 FALSE FALSE
factor(weather_main)8 FALSE FALSE
factor(weather_main)9 FALSE FALSE
factor(weather_main)11 FALSE FALSE
factor(weather_main)12 FALSE FALSE
1 subsets of each size up to 11
Selection Algorithm: backward
E_1 E_25 temp humidity pressure wind_speed rain_duration factor(day_night)2 factor(time_band)2 factor(time_band)3 factor(season)2
1 ( 1 ) "*" " " " " " " " " " " " " " " " " " " " "
2 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
3 ( 1 ) "*" "*" " " "*" " " " " " " " " " " " " " "
4 ( 1 ) "*" "*" " " "*" " " " " " " "*" " " " " " "
5 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " " "
6 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*"
7 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*"
8 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*"
9 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*"
10 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " "*" "*"
11 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" " " "*" "*"
factor(season)3 factor(season)4 factor(weather_main)2 factor(weather_main)3 factor(weather_main)4 factor(weather_main)5 factor(weather_main)6
1 ( 1 ) " " " " " " " " " " " " " "
2 ( 1 ) " " " " " " " " " " " " " "
3 ( 1 ) " " " " " " " " " " " " " "
4 ( 1 ) " " " " " " " " " " " " " "
5 ( 1 ) " " " " " " " " " " " " " "
6 ( 1 ) " " " " " " " " " " " " " "
7 ( 1 ) " " " " " " " " " " " " " "
8 ( 1 ) " " "*" " " " " " " " " " "
9 ( 1 ) " " "*" " " " " " " " " "*"
10 ( 1 ) " " "*" " " " " " " " " "*"
11 ( 1 ) " " "*" " " " " " " " " "*"
factor(weather_main)7 factor(weather_main)8 factor(weather_main)9 factor(weather_main)11 factor(weather_main)12
1 ( 1 ) " " " " " " " " " "
2 ( 1 ) " " " " " " " " " "
3 ( 1 ) " " " " " " " " " "
4 ( 1 ) " " " " " " " " " "
5 ( 1 ) " " " " " " " " " "
6 ( 1 ) " " " " " " " " " "
7 ( 1 ) " " " " " " " " " "
8 ( 1 ) " " " " " " " " " "
9 ( 1 ) " " " " " " " " " "
10 ( 1 ) " " " " " " " " " "
11 ( 1 ) " " " " " " " " " "
Reduced model same as P-value approach
#ModelOLS <- ols_step_all_possible(workModelSevilleFull)
#cat("\n")
#plot(x = ModelOLS$n, y = ModelOLS$adjr)
#plot(x = ModelOLS$n, y = ModelOLS$aic)
#OLSsummary <- ModelOLS %>% group_by(n) %>% filter(adjr == max(adjr))
Same reduced model as P-Value approach (thus commenting out given processing time)
Model adjusted R-square from above methods (to four significant digits)
Full: 0.9199
P-value: 0.9199
Bestfits: 0.9189
AIC Method: 0.9199
Cross-validation: 0.9199
Final model is the workModelSevillePvalue with highest Adjusted R-square and fewest predictors.
# Summary comparison of the Seville model-selection methods: number of
# predictor variables, number of fitted coefficients, and adjusted R^2.
SevilleTable <- data.frame(
  Method       = c("Full", "P-value", "BestFits", "AIC Forward", "Cross Validation"),
  Variables    = c(12, 11, 8, 12, 11),
  Coefficients = c(24, 23, 10, 24, 23),
  Adjusted_R2  = c(0.9199264, 0.9199208, 0.9189457, 0.9199264, 0.9199208)
)
# Keep the standalone vectors in scope, as downstream code may reference them
Method       <- SevilleTable$Method
Variables    <- SevilleTable$Variables
Coefficients <- SevilleTable$Coefficients
Adjusted_R2  <- SevilleTable$Adjusted_R2
workFinal <- workModelSevillePvalue
par(mfrow=c(2,2))
#Residual Plot
plot(x=fitted(workFinal), y=studres(workFinal), xlab = "Fitted Values",
ylab = "Studentized Residuals", main='Residuals Plot', col = ifelse(studres(workFinal) < -3,"red",ifelse(studres(workFinal) > 3,"red","black")))
abline(h=-3, col="red", lty=2)
abline(h=3, col="red", lty=2)
#norm test
std.res <- rstandard(workFinal)
ad.test(std.res)
Anderson-Darling normality test
data: std.res
A = 131.25, p-value < 0.00000000000000022
qqnorm(studres(workFinal), pch = 1, frame = FALSE, main=expression("AD test =2.2*10"^-16))
qqline(studres(workFinal), col = "steelblue", lwd = 2)
#leverage
lev<-hatvalues(workFinal)
cutlev = (2*length(coefficients(workFinal)))/nrow(workSeville)
# Count and assess the number of leverage values > cut-off
potoutlier <- sum(lev > cutlev, na.rm=TRUE)
totcount = length(fitted(workFinal))
print(paste("The number of leverage points that are potential outliers is", potoutlier,". This is", round(100*potoutlier/totcount,2),"% of the total number of predicted values, (", totcount,") thus not material."))
[1] "The number of leverage points that are potential outliers is 2134 . This is 7.62 % of the total number of predicted values, ( 28000 ) thus not material."
#barplot(lev, ylim = c(0, 2*cutlev))
# Leverage spike plot with the 2p/n cut-off as a dashed red line
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for the working set")
abline(h = cutlev, col = "red", lty=2)
#cook distance
cookdist<-cooks.distance(workFinal)
#barplot(cookdist, ylim=c(0,1.01), main = "Cook's Distance plot")
#abline(h = 1, col = "red")
# Cook's distance by observation index; y-axis broken so the D = 1
# influence threshold stays visible despite all values being tiny
ggplot(as.data.frame(cookdist),aes(x=1:nrow(as.data.frame(cookdist)),y=cookdist)) +
geom_line() +
geom_hline(aes(yintercept = 1,color = "red"))+
scale_y_continuous(limits = c(0,1.1))+
scale_y_break(breaks = c(0.03,0.9), scales =0.1) +
guides(color = "none") +
labs(title="Cook's Distance",
x ="Index", y = "")
# Opening the graphical device
# Customizing the output
# Report figure: studentized-residual scatter written to PDF
pdf("SevilleAssumptionCheck1.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
# par(mfrow=c(2,2))
#residuals scatter plot
plot(x=fitted(workFinal), y=studres(workFinal), xlab = "Fitted Values",
ylab = "Studentized Residuals", main='Residuals Plot', col = ifelse(studres(workFinal) < -3,"red",ifelse(studres(workFinal) > 3,"red","black")))
abline(h=-3, col="red", lty=2)
abline(h=3, col="red", lty=2)
grid(10,10)
dev.off()
null device
1
#normality plot
# Report figure: Q-Q plot of studentized residuals written to PDF
pdf("SevilleAssumptionCheck2.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
qqnorm(studres(workFinal), pch = 1, frame = FALSE, main=expression("AD test =2.2*10"^-16))
qqline(studres(workFinal), col = "steelblue", lwd = 2)
grid(10,10)
dev.off()
null device
1
#leverage plot
# Report figure: leverage of each working-set observation with the 2p/n
# cut-off shown as a dashed red line.
pdf("SevilleAssumptionCheck3.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
# NOTE(review): label previously read "Barcelona working set" -- copy/paste
# error; this section analyses Seville.
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for Seville working set")
abline(h = cutlev, col = "red", lty=2)
grid(10,10)
dev.off()
null device
1
#Cook's distance plot
# Report figure: Cook's distance (broken y-axis) written to PDF
pdf("SevilleAssumptionCheck4.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
# NOTE(review): a top-level ggplot auto-prints to the open device; if this
# code is moved inside a function or loop, wrap it in print() or the PDF
# will come out empty.
ggplot(as.data.frame(cookdist),aes(x=1:nrow(as.data.frame(cookdist)),y=cookdist)) +
geom_line() +
geom_hline(aes(yintercept = 1,color = "red"))+
scale_y_continuous(limits = c(0,1.1))+
scale_y_break(breaks = c(0.005,0.9), scales =0.1) +
guides(color = "none") +
labs(title="Cook's Distance",
x ="Index", y = "")
dev.off()
null device
1
# Predict on the evaluation set. Columns 2 and 13 are dropped -- assumed to
# be the non-predictor columns of evalSeville (TODO confirm its layout).
newdata <- evalSeville[ c(-2,-13)]
predict.eval <- predict(workFinal, newdata)
# Actual vs predicted demand; the blue y = x line marks perfect prediction
plot(x = evalSeville$energy_demand, y=predict.eval, xlab="Actual energy demand for the evaluation set",
ylab="Predicted energy demand for the evaluation set", main = "Seville")
abline(a=0, b=1, col="blue")
grid(10,10)
Prepare report figure
# Opening the graphical device
# Customizing the output
# Report figure: actual vs predicted demand on the evaluation set
pdf("SevilleEvalActComp.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
par(mfrow=c(1,1))
plot(x = evalSeville$energy_demand, y=predict.eval, xlab="Actual energy demand for the evaluation set",
ylab="Predicted energy demand for the evaluation set", main = "Seville", cex.lab = 0.75)
abline(a=0, b=1, col="blue")
grid(10,10)
# Closing the graphical device
dev.off()
null device
1
# RMSE between actual and predicted demand on the evaluation set.
# NOTE(review): removed a duplicate rmse() call that immediately overwrote
# the first; evalSeville is derived from evaldata (same rows), so
# evalSeville is used consistently here.
rootmse <- rmse(evalSeville$energy_demand, predict.eval)
print(paste("The root mean square error of the predicted vs actual",nrow(evalSeville),"hourly energy_data points for Seville over the 2015-2018 timespan is an energy demand difference of",round(rootmse,2),"MWh."))
[1] "The root mean square error of the predicted vs actual 7001 hourly energy_data points for Seville over the 2015-2018 timespan is an energy demand difference of 54.39 MWh."
cat("\n")
# Side-by-side actual vs predicted demand for the first few evaluation rows
demandcompdf <- data.frame("Actual (MWh)" = evalSeville$energy_demand,"Predicted (MWh)" = predict.eval)
head(demandcompdf,4L)
cat("\n")
# Worked example: signed percent error of the first prediction
print(paste("For example, the first actual demand value is",round(evaldata$energy_demand[1],2),"MWh and the corresponding predicted value is",round(predict.eval[1],2),"MWh. The percent error between this predicted value relative to the actual value is",round((100*(predict.eval[1] - evaldata$energy_demand[1])/evaldata$energy_demand[1]),2),"%."))
[1] "For example, the first actual demand value is 1025.77 MWh and the corresponding predicted value is 962.18 MWh. The percent error between this predicted value relative to the actual value is -6.2 %."
cat("\n")
# Mean absolute percentage difference between actual and predicted values
diffpct <- c((abs(evaldata$energy_demand - predict.eval))/evaldata$energy_demand)
print(paste("The mean difference between the",nrow(evaldata),"evaluation data set actual and model predicted values is",round(100*mean(diffpct),2),"%."))
[1] "The mean difference between the 7001 evaluation data set actual and model predicted values is 3.49 %."
Obtain X matrix from the model
# Extract the coefficient (predictor) names of the final working model.
# NOTE(review): dropped the redundant `[,]` index -- the coefficient table is
# a k x 4 matrix, so `[,]` returned it unchanged.
Predictors <-rownames(summary(workFinal)$coefficients)
workpredictors <- as.data.frame(Predictors)
# Design matrix of the working-set regression
X <- model.matrix(workFinal)
class(X)
[1] "matrix" "array"
cat("\n")
dim(X)
[1] 28000 23
cat("\n")
head(X, 4L)
(Intercept) E_1 E_25 temp humidity pressure wind_speed factor(day_night)2 factor(time_band)2 factor(time_band)3 factor(season)2
1 1 1025.7701 1138.8564 273.375 75 1039 1 1 0 0 0
2 1 895.5189 976.6314 274.086 71 1039 3 1 0 0 0
3 1 852.5226 948.2757 274.086 71 1039 3 1 0 0 0
4 1 837.4192 929.5963 274.086 71 1039 3 1 0 0 0
factor(season)3 factor(season)4 factor(weather_main)2 factor(weather_main)3 factor(weather_main)4 factor(weather_main)5 factor(weather_main)6
1 0 1 0 0 0 0 0
2 0 1 0 0 0 0 0
3 0 1 0 0 0 0 0
4 0 1 0 0 0 0 0
factor(weather_main)7 factor(weather_main)8 factor(weather_main)9 factor(weather_main)11 factor(weather_main)12
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
Create x_new matrix using the validation data. Note that the purpose of running the evaluation regression here is to extract all of the correct predictor coefficients.
# Refit the final model formula on the evaluation data purely to obtain its
# design matrix and coefficient names (not for inference).
evalFinal <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure +
wind_speed + factor(day_night) + factor(time_band) +
factor(season) + factor(weather_main), data = evalSeville)
# NOTE(review): dropped the redundant `[,]` index on the coefficient matrix.
Predictors <-rownames(summary(evalFinal)$coefficients)
evalpredictors <- as.data.frame(Predictors)
x_new1 <- model.matrix(evalFinal)
cat("\n")
dim(x_new1)
[1] 7001 22
cat("\n")
#Check if there's a predictor difference between the work and eval matrices
# anti_join keeps predictors present in the working model but absent from
# the evaluation model; an explicit `by` makes the key intentional and
# silences the 'Joining, by = ...' message.
workevalcompare <-anti_join(workpredictors,evalpredictors, by = "Predictors")
Joining, by = "Predictors"
cat("\n")
#If workevalcompare is 'No data available in table', run the next line of code and skip to line 1580
#x_new <- x_new1
#Otherwise when workevalcompare is not 'No data available in table', run the following lines of code after identifying which column is missing between X and x_new1 (for Seville, it was found that "factor(weather_main)11" appears in workSeville but not in evalSeville).
# Append an all-zero column for the factor level missing from the evaluation
# set, move it to position 22 so x_new's columns line up with X's, then
# convert back to a matrix for the hat-value algebra below.
# NOTE(review): as.data.frame() makes the matrix column names syntactic
# (e.g. "factor(weather_main)2" -> "factor.weather_main.2"); harmless here
# because the downstream matrix multiplication ignores column names.
x_new_ <-as.data.frame(cbind(x_new1,rep(0,nrow(x_new1))))
dummy_xnew<- x_new_ %>% relocate(V23,.before = 22) %>% rename("factor(weather_main)11" = V23)
x_new <- as.matrix(dummy_xnew)
cat("\n")
class(x_new)
[1] "matrix" "array"
cat("\n")
dim(x_new)
[1] 7001 23
cat("\n")
head(x_new, 4L)
(Intercept) E_1 E_25 temp humidity pressure wind_speed factor(day_night)2 factor(time_band)2 factor(time_band)3 factor(season)2
1 1 956.4375 1049.0353 273.375 75 1039 1 1 0 0 0
2 1 845.3706 968.6801 274.592 81 1039 4 1 0 0 0
3 1 868.2150 997.0358 275.651 73 1041 2 0 1 0 0
4 1 972.5926 1176.8463 286.994 53 1036 3 0 1 0 0
factor(season)3 factor(season)4 factor(weather_main)2 factor(weather_main)3 factor(weather_main)4 factor(weather_main)5 factor(weather_main)6
1 0 1 0 0 0 0 0
2 0 1 0 0 0 0 0
3 0 1 0 0 0 0 0
4 0 1 0 0 0 0 0
factor(weather_main)7 factor(weather_main)8 factor(weather_main)9 factor(weather_main)11 factor(weather_main)12
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
Compute the hat values for the validation data
# (X'X)^-1 from the working-set design matrix
# NOTE(review): solve(t(X) %*% X) can be numerically unstable for
# ill-conditioned X; chol2inv(qr.R(qr(X))) would be safer -- left as-is to
# preserve the reported output.
h_new_mid <- solve(t(X)%*%X)
dim(h_new_mid)
[1] 23 23
# Hat matrix of the evaluation rows w.r.t. the working fit: x_new (X'X)^-1 x_new'
# NOTE(review): this materializes a full 7001 x 7001 matrix although only
# diag() is used later; rowSums((x_new %*% h_new_mid) * x_new) would give the
# same diagonal in O(n*p) memory.
h_new <- x_new%*%h_new_mid%*%t(x_new)
dim(h_new)
[1] 7001 7001
Plot the hatvalues for the data in the evaluation set
#cutlev2 = (2*length(coefficients(evalFinal)))/nrow(evalSeville)
# Count and assess the number of leverage values > cut-off
# (the working-set cut-off cutlev is reused for comparability)
potoutlier2 <- sum(diag(h_new) > cutlev, na.rm=TRUE)
totcount2 = nrow(evalSeville)
plot(diag(h_new), type = "h", ylab = "Leverage for the validation data set", ylim = c(0,2*cutlev), main = "Seville")
abline(h=cutlev, col="red", lty=2)
print(paste("The number of leverage points that are potential outliers is", potoutlier2,". This is", round(100*potoutlier2/totcount2,2),"% of the total number of predicted values, (", totcount2,") thus not material."))
[1] "The number of leverage points that are potential outliers is 532 . This is 7.6 % of the total number of predicted values, ( 7001 ) thus not material."
Side-by-side comparison
# Leverage comparison: working set (left) vs evaluation set (right)
par(mfrow=c(1,2))
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for the working data set", main = "Seville")
abline(h = cutlev, col = "red", lty=2)
plot(diag(h_new), type = "h", ylim = c(0,2*cutlev), ylab = "Leverage for the evaluation data set", main = "Seville")
abline(h=cutlev, col="red", lty=2)
print(paste("The leverage cutoff exceedences in the evaluation set is",round(100*potoutlier2/totcount2,2),"% of the total number of observed values (", totcount2,") and is comparable to that of the working set,", round(100*potoutlier/totcount,2),"% of the total number of predicted values (", totcount,")."))
[1] "The leverage cutoff exceedences in the evaluation set is 7.6 % of the total number of observed values ( 7001 ) and is comparable to that of the working set, 7.62 % of the total number of predicted values ( 28000 )."
Prepare report figure
# Opening the graphical device
# Customizing the output
# Report figure: side-by-side leverage plots for the Seville working and
# evaluation sets with a common 2p/n cut-off line.
pdf("SevilleLevComp.pdf", # File name
width = 14, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
par(mfrow=c(1,2))
# NOTE(review): removed invalid `res = 600` -- `res` is a bitmap-device
# (png/jpeg) argument, not a plot()/par() graphical parameter, and only
# produced '"res" is not a graphical parameter' warnings here.
plot(lev, type="h", ylab = "Leverage", ylim = c(0,2*cutlev), yaxt = 'n', main = "Seville Working Data")
abline(h = cutlev, col = "red", lty=2)
axis(side = 2, at=seq(0,2*round(cutlev,5),by=round(cutlev,5)))
plot(diag(h_new), type = "h", ylab = "", ylim = c(0,2*cutlev), yaxt = 'n', main = "Seville Evaluation Data")
abline(h=cutlev, col="red", lty=2)
axis(side = 2, at=seq(0,2*round(cutlev,5),by=round(cutlev,5)))
# Closing the graphical device
dev.off()
null device
1
# ---- Valencia: load working and evaluation splits ----
# NOTE(review): paths use lowercase 'data/' while the weather file at the top
# of the script is read from 'Data/'; on a case-sensitive filesystem one of
# the two will fail -- confirm the actual directory name.
evaldata<-read.csv('data/evaluation_Valencia.csv')
workdata<-read.csv('data/working_Valencia.csv')
#remove index and time_ID columns from "eval" analysis dataframe
evalValencia<-evaldata[c(-1,-2)]
#remove index and time_ID columns from "work" analysis dataframe
workValencia<-workdata[c(-1,-2)]
head(workValencia, 4L)
# Correlation matrix for numeric predictors only (categorical predictors excluded)
workcor<-workValencia[c(-10:-13) ]
ggpairs(workcor)
cor(workValencia)
E_1 E_2 E_25 temp humidity pressure wind_speed rain_duration snow_duration day_night
E_1 1.000000000 0.951054128 0.666939844 0.19109473 -0.28730452 0.01627940 0.13872213 0.0268061475 -0.005782161 -0.54254634
E_2 0.951054128 1.000000000 0.571164357 0.15367732 -0.25124085 0.01529907 0.12787798 0.0220813226 -0.005757167 -0.53504092
E_25 0.666939844 0.571164357 1.000000000 0.21893597 -0.31975014 0.01584015 0.16516026 0.0095613923 -0.005214940 -0.51180659
temp 0.191094734 0.153677316 0.218935967 1.00000000 -0.40538712 -0.12401369 0.08321668 -0.0828673326 -0.017524492 -0.38474840
humidity -0.287304516 -0.251240855 -0.319750141 -0.40538712 1.00000000 0.07482097 -0.38567524 0.1341640999 -0.011280937 0.44420938
pressure 0.016279401 0.015299073 0.015840152 -0.12401369 0.07482097 1.00000000 -0.12331451 -0.0731064209 -0.016919098 0.05327426
wind_speed 0.138722130 0.127877981 0.165160263 0.08321668 -0.38567524 -0.12331451 1.00000000 0.0235262566 0.011156495 -0.18975982
rain_duration 0.026806148 0.022081323 0.009561392 -0.08286733 0.13416410 -0.07310642 0.02352626 1.0000000000 0.009657072 -0.01079121
snow_duration -0.005782161 -0.005757167 -0.005214940 -0.01752449 -0.01128094 -0.01691910 0.01115650 0.0096570721 1.000000000 -0.01016594
day_night -0.542546338 -0.535040922 -0.511806591 -0.38474840 0.44420938 0.05327426 -0.18975982 -0.0107912145 -0.010165940 1.00000000
time_band 0.110986153 0.091678654 0.113058687 -0.13846625 -0.01513187 0.02024851 0.13793701 -0.0001837309 0.085006810 -0.00297295
season 0.078852572 0.079163395 0.077294925 -0.33696550 0.15583643 0.26596837 0.04690817 -0.0457883003 0.014244862 0.15019176
weather_main 0.032252910 0.025255947 0.020597389 -0.06837167 0.23950590 -0.12315269 0.04802916 0.4778286676 0.035801103 -0.03744148
energy_demand 0.951146210 0.832911624 0.700323450 0.22238880 -0.31157854 0.01880238 0.14782748 0.0294762210 -0.006658955 -0.50985063
time_band season weather_main energy_demand
E_1 0.1109861526 0.07885257 0.03225291 0.951146210
E_2 0.0916786545 0.07916339 0.02525595 0.832911624
E_25 0.1130586874 0.07729493 0.02059739 0.700323450
temp -0.1384662509 -0.33696550 -0.06837167 0.222388801
humidity -0.0151318666 0.15583643 0.23950590 -0.311578541
pressure 0.0202485123 0.26596837 -0.12315269 0.018802377
wind_speed 0.1379370082 0.04690817 0.04802916 0.147827481
rain_duration -0.0001837309 -0.04578830 0.47782867 0.029476221
snow_duration 0.0850068105 0.01424486 0.03580110 -0.006658955
day_night -0.0029729496 0.15019176 -0.03744148 -0.509850631
time_band 1.0000000000 0.16430656 0.01155377 0.123586398
season 0.1643065599 1.00000000 -0.02378259 0.078241487
weather_main 0.0115537696 -0.02378259 1.00000000 0.037878776
energy_demand 0.1235863979 0.07824149 0.03787878 1.000000000
# Correlation heat map of the numeric predictors
corrplot(cor(workcor), method="color", type="full", addCoef.col = "red", tl.col="black",number.cex = 0.75)
# Full candidate model with all predictors; check multicollinearity via GVIF
workModelValencia1 <- lm(energy_demand ~ E_1 + E_2 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + snow_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workValencia)
vif(workModelValencia1)
GVIF Df GVIF^(1/(2*Df))
E_1 14.171607 1 3.764520
E_2 12.017128 1 3.466573
E_25 2.156342 1 1.468449
temp 3.403566 1 1.844876
humidity 1.994951 1 1.412427
pressure 1.139913 1 1.067667
wind_speed 1.331132 1 1.153747
rain_duration 1.456960 1 1.207046
snow_duration 1.011179 1 1.005574
factor(day_night) 1.965917 1 1.402112
factor(time_band) 1.111991 2 1.026893
factor(season) 3.270467 3 1.218339
factor(weather_main) 1.816528 8 1.038013
# Refit without E_2 (its GVIF exceeded 10 above, and E_1/E_2 are highly
# correlated); this becomes the "Full" model for the selection methods below
workModelValenciaFull <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + snow_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workValencia)
vif(workModelValenciaFull)
GVIF Df GVIF^(1/(2*Df))
E_1 2.039961 1 1.428272
E_25 1.973528 1 1.404823
temp 3.339423 1 1.827409
humidity 1.993134 1 1.411784
pressure 1.139889 1 1.067656
wind_speed 1.329233 1 1.152924
rain_duration 1.456598 1 1.206896
snow_duration 1.011170 1 1.005569
factor(day_night) 1.889044 1 1.374425
factor(time_band) 1.108226 2 1.026023
factor(season) 3.226785 3 1.215612
factor(weather_main) 1.813697 8 1.037911
# ---- Residual diagnostics for the Valencia full model ----
# to obtain residuals
res.fullmodel1 <- residuals(workModelValenciaFull)
# to obtain standardized residuals
std.res.fullmodel1 <- rstandard(workModelValenciaFull)
# to obtain fitted/predicted values
pred.fullmodel1 <- fitted.values(workModelValenciaFull)
par(mfrow=c(1,1))
qqnorm(y = std.res.fullmodel1, main = " Normal Q-Q Plot ",
xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
qqline(y = std.res.fullmodel1)
#residual plot
# Best-fit line through the residuals (should be ~flat if mean-zero holds)
resplotdata1 <- data.frame(std.res.fullmodel1, pred.fullmodel1)
resbf1 <- lm(std.res.fullmodel1 ~ pred.fullmodel1, data = resplotdata1)
plot(x = pred.fullmodel1, y = std.res.fullmodel1, ylab = "Standardized Residuals", xlab = "Predicted Values", main = "Residuals Plot", col = ifelse(std.res.fullmodel1 < -3,"red",ifelse(std.res.fullmodel1 > 3,"red","black")))
abline(h = 0, col="blue", lty=1)
abline(resbf1, col="red", lty=3)
abline(h = 3, col="green", lty=3)
abline(h=-3, col="green", lty=3)
legend("bottomleft", legend=c("Best fit line of standardized residuals", "Horizontal line y = 0.0", "Horizontal line, y = +/- 3"), fill = c("red","blue","green"), cex = 0.5)
summary(workModelValenciaFull)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + pressure +
wind_speed + rain_duration + snow_duration + factor(day_night) +
factor(time_band) + factor(season) + factor(weather_main),
data = workValencia)
Residuals:
Min 1Q Median 3Q Max
-714.78 -41.44 6.33 45.09 647.09
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1003.690971 56.667174 -17.712 < 0.0000000000000002 ***
E_1 0.884855 0.002462 359.354 < 0.0000000000000002 ***
E_25 0.118354 0.002424 48.819 < 0.0000000000000002 ***
temp 3.036519 0.111534 27.225 < 0.0000000000000002 ***
humidity -0.325270 0.031569 -10.303 < 0.0000000000000002 ***
pressure 0.123376 0.046657 2.644 0.00819 **
wind_speed -0.852805 0.194388 -4.387 0.00001152758427349 ***
rain_duration 6.035686 1.965320 3.071 0.00213 **
snow_duration -48.622588 51.418802 -0.946 0.34435
factor(day_night)2 34.469671 1.211297 28.457 < 0.0000000000000002 ***
factor(time_band)2 2.754287 6.014925 0.458 0.64702
factor(time_band)3 44.964025 4.505279 9.980 < 0.0000000000000002 ***
factor(season)2 -26.939800 1.654365 -16.284 < 0.0000000000000002 ***
factor(season)3 -10.371166 1.345340 -7.709 0.00000000000001311 ***
factor(season)4 9.712012 1.440856 6.740 0.00000000001610023 ***
factor(weather_main)2 4.982183 0.968803 5.143 0.00000027278240873 ***
factor(weather_main)3 26.571095 11.630332 2.285 0.02234 *
factor(weather_main)5 -3.902488 12.443279 -0.314 0.75381
factor(weather_main)6 28.645610 51.837893 0.553 0.58054
factor(weather_main)7 15.461673 6.125109 2.524 0.01160 *
factor(weather_main)8 20.305312 2.564494 7.918 0.00000000000000251 ***
factor(weather_main)9 83.961927 32.799314 2.560 0.01048 *
factor(weather_main)12 12.036941 7.374031 1.632 0.10262
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 73.29 on 27682 degrees of freedom
Multiple R-squared: 0.9178, Adjusted R-squared: 0.9177
F-statistic: 1.404e+04 on 22 and 27682 DF, p-value: < 0.00000000000000022
# Tidy the full-model coefficient table into a data frame
df<- workModelValenciaFull %>%
tidy()
#Get the variables which there p-value larger than 0.05
# NOTE(review): use the bare column inside filter() rather than df$p.value;
# referring back to the piped object is redundant and misbehaves if the data
# were grouped or re-filtered upstream.
df%>%
filter(p.value>0.05)
# Reduced model: drop snow_duration (the only non-significant numeric term)
workModelValenciaPvalue <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workValencia)
summary(workModelValenciaPvalue)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + pressure +
wind_speed + rain_duration + factor(day_night) + factor(time_band) +
factor(season) + factor(weather_main), data = workValencia)
Residuals:
Min 1Q Median 3Q Max
-714.80 -41.47 6.31 45.11 647.11
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1004.925992 56.652013 -17.739 < 0.0000000000000002 ***
E_1 0.884894 0.002462 359.422 < 0.0000000000000002 ***
E_25 0.118386 0.002424 48.837 < 0.0000000000000002 ***
temp 3.038226 0.111519 27.244 < 0.0000000000000002 ***
humidity -0.324448 0.031557 -10.281 < 0.0000000000000002 ***
pressure 0.123945 0.046653 2.657 0.00789 **
wind_speed -0.850685 0.194374 -4.377 0.0000121024974039 ***
rain_duration 6.061477 1.965127 3.085 0.00204 **
factor(day_night)2 34.492451 1.211055 28.481 < 0.0000000000000002 ***
factor(time_band)2 2.569012 6.011721 0.427 0.66914
factor(time_band)3 44.633275 4.491673 9.937 < 0.0000000000000002 ***
factor(season)2 -26.966190 1.654127 -16.302 < 0.0000000000000002 ***
factor(season)3 -10.388206 1.345217 -7.722 0.0000000000000118 ***
factor(season)4 9.698814 1.440786 6.732 0.0000000000171069 ***
factor(weather_main)2 4.975234 0.968773 5.136 0.0000002831463489 ***
factor(weather_main)3 26.556133 11.630299 2.283 0.02242 *
factor(weather_main)5 -3.927628 12.443227 -0.316 0.75228
factor(weather_main)6 28.642884 51.837794 0.553 0.58058
factor(weather_main)7 15.438547 6.125048 2.521 0.01172 *
factor(weather_main)8 20.197124 2.561935 7.884 0.0000000000000033 ***
factor(weather_main)9 83.939146 32.799242 2.559 0.01050 *
factor(weather_main)12 12.012127 7.373970 1.629 0.10333
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 73.29 on 27683 degrees of freedom
Multiple R-squared: 0.9178, Adjusted R-squared: 0.9177
F-statistic: 1.471e+04 on 21 and 27683 DF, p-value: < 0.00000000000000022
# Partial F-test: confirms dropping snow_duration does not significantly worsen fit
anova(workModelValenciaPvalue, workModelValenciaFull)
Analysis of Variance Table
Model 1: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
rain_duration + factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Model 2: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
rain_duration + snow_duration + factor(day_night) + factor(time_band) +
factor(season) + factor(weather_main)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 27683 148683378
2 27682 148678576 1 4802.7 0.8942 0.3444
# Best-subsets search (nbest = 1 model per subset size), visualized by adjusted R-square
bestfits <- regsubsets(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + snow_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workValencia, nbest = 1)
plot(bestfits, scale="adjr2")
# Model suggested by the best-subsets plot
workModelValenciaBestfit<-lm(energy_demand ~ E_1 + E_25 + temp + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workValencia)
summary(workModelValenciaBestfit)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + factor(day_night) +
factor(time_band) + factor(season) + factor(weather_main),
data = workValencia)
Residuals:
Min 1Q Median 3Q Max
-713.54 -41.68 6.32 45.48 645.50
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1026.388603 29.108302 -35.261 < 0.0000000000000002 ***
E_1 0.885725 0.002466 359.232 < 0.0000000000000002 ***
E_25 0.120298 0.002420 49.719 < 0.0000000000000002 ***
temp 3.465467 0.100419 34.510 < 0.0000000000000002 ***
factor(day_night)2 32.134863 1.187753 27.055 < 0.0000000000000002 ***
factor(time_band)2 0.989703 6.007040 0.165 0.86914
factor(time_band)3 43.479302 4.476508 9.713 < 0.0000000000000002 ***
factor(season)2 -31.757105 1.552625 -20.454 < 0.0000000000000002 ***
factor(season)3 -13.493634 1.286982 -10.485 < 0.0000000000000002 ***
factor(season)4 10.088154 1.382228 7.298 0.000000000000298884 ***
factor(weather_main)2 2.443658 0.936693 2.609 0.00909 **
factor(weather_main)3 20.134668 11.638553 1.730 0.08364 .
factor(weather_main)5 -11.784953 12.439785 -0.947 0.34346
factor(weather_main)6 26.107025 51.946095 0.503 0.61526
factor(weather_main)7 8.294013 6.087548 1.362 0.17307
factor(weather_main)8 17.024099 2.091205 8.141 0.000000000000000409 ***
factor(weather_main)9 84.584591 32.867921 2.573 0.01007 *
factor(weather_main)12 4.984582 7.346590 0.678 0.49747
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 73.44 on 27687 degrees of freedom
Multiple R-squared: 0.9174, Adjusted R-squared: 0.9174
F-statistic: 1.809e+04 on 17 and 27687 DF, p-value: < 0.00000000000000022
anova(workModelValenciaBestfit, workModelValenciaPvalue)
Analysis of Variance Table
Model 1: energy_demand ~ E_1 + E_25 + temp + factor(day_night) + factor(time_band) +
factor(season) + factor(weather_main)
Model 2: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
rain_duration + factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 27687 149330620
2 27683 148683378 4 647242 30.127 < 0.00000000000000022 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Forward stepwise AIC selection from the intercept-only model up to the full model
workNullModel<- lm(energy_demand ~ 1, data = workValencia)
step.working <- stepAIC(workNullModel, scope = list(lower = workNullModel,
upper = workModelValenciaFull), direction = "forward", trace=FALSE)
summary(step.working)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + factor(day_night) +
factor(season) + factor(time_band) + factor(weather_main) +
humidity + wind_speed + rain_duration + pressure, data = workValencia)
Residuals:
Min 1Q Median 3Q Max
-714.80 -41.47 6.31 45.11 647.11
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1004.925992 56.652013 -17.739 < 0.0000000000000002 ***
E_1 0.884894 0.002462 359.422 < 0.0000000000000002 ***
E_25 0.118386 0.002424 48.837 < 0.0000000000000002 ***
temp 3.038226 0.111519 27.244 < 0.0000000000000002 ***
factor(day_night)2 34.492451 1.211055 28.481 < 0.0000000000000002 ***
factor(season)2 -26.966190 1.654127 -16.302 < 0.0000000000000002 ***
factor(season)3 -10.388206 1.345217 -7.722 0.0000000000000118 ***
factor(season)4 9.698814 1.440786 6.732 0.0000000000171069 ***
factor(time_band)2 2.569012 6.011721 0.427 0.66914
factor(time_band)3 44.633275 4.491673 9.937 < 0.0000000000000002 ***
factor(weather_main)2 4.975234 0.968773 5.136 0.0000002831463489 ***
factor(weather_main)3 26.556133 11.630299 2.283 0.02242 *
factor(weather_main)5 -3.927628 12.443227 -0.316 0.75228
factor(weather_main)6 28.642884 51.837794 0.553 0.58058
factor(weather_main)7 15.438547 6.125048 2.521 0.01172 *
factor(weather_main)8 20.197124 2.561935 7.884 0.0000000000000033 ***
factor(weather_main)9 83.939146 32.799242 2.559 0.01050 *
factor(weather_main)12 12.012127 7.373970 1.629 0.10333
humidity -0.324448 0.031557 -10.281 < 0.0000000000000002 ***
wind_speed -0.850685 0.194374 -4.377 0.0000121024974039 ***
rain_duration 6.061477 1.965127 3.085 0.00204 **
pressure 0.123945 0.046653 2.657 0.00789 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 73.29 on 27683 degrees of freedom
Multiple R-squared: 0.9178, Adjusted R-squared: 0.9177
F-statistic: 1.471e+04 on 21 and 27683 DF, p-value: < 0.00000000000000022
# Set seed for reproducibility in CV
set.seed(123)
# Set up repeated k-fold cross-validation, with k=10
train.control <- trainControl(method = "cv", number = 10)
# Train the model
# NOTE(review): subset search here is "leapBackward" (backward elimination)
# while the AIC step above runs forward -- confirm backward is intended.
# nvmax caps the subset size at 12 of the 22 dummy-expanded terms.
step.cv.work <- train(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + snow_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workValencia, method = "leapBackward",
tuneGrid = data.frame(nvmax = 1:12),
trControl = train.control)
step.cv.work$results
summary(step.cv.work$finalModel)
Subset selection object
22 Variables (and intercept)
Forced in Forced out
E_1 FALSE FALSE
E_25 FALSE FALSE
temp FALSE FALSE
humidity FALSE FALSE
pressure FALSE FALSE
wind_speed FALSE FALSE
rain_duration FALSE FALSE
snow_duration FALSE FALSE
factor(day_night)2 FALSE FALSE
factor(time_band)2 FALSE FALSE
factor(time_band)3 FALSE FALSE
factor(season)2 FALSE FALSE
factor(season)3 FALSE FALSE
factor(season)4 FALSE FALSE
factor(weather_main)2 FALSE FALSE
factor(weather_main)3 FALSE FALSE
factor(weather_main)5 FALSE FALSE
factor(weather_main)6 FALSE FALSE
factor(weather_main)7 FALSE FALSE
factor(weather_main)8 FALSE FALSE
factor(weather_main)9 FALSE FALSE
factor(weather_main)12 FALSE FALSE
1 subsets of each size up to 12
Selection Algorithm: backward
E_1 E_25 temp humidity pressure wind_speed rain_duration snow_duration factor(day_night)2 factor(time_band)2 factor(time_band)3 factor(season)2
1 ( 1 ) "*" " " " " " " " " " " " " " " " " " " " " " "
2 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " " "
3 ( 1 ) "*" "*" "*" " " " " " " " " " " " " " " " " " "
4 ( 1 ) "*" "*" "*" " " " " " " " " " " "*" " " " " " "
5 ( 1 ) "*" "*" "*" " " " " " " " " " " "*" " " " " "*"
6 ( 1 ) "*" "*" "*" " " " " " " " " " " "*" " " " " "*"
7 ( 1 ) "*" "*" "*" " " " " " " " " " " "*" " " "*" "*"
8 ( 1 ) "*" "*" "*" " " " " " " " " " " "*" " " "*" "*"
9 ( 1 ) "*" "*" "*" "*" " " " " " " " " "*" " " "*" "*"
10 ( 1 ) "*" "*" "*" "*" " " " " " " " " "*" " " "*" "*"
11 ( 1 ) "*" "*" "*" "*" " " " " " " " " "*" " " "*" "*"
12 ( 1 ) "*" "*" "*" "*" " " "*" " " " " "*" " " "*" "*"
factor(season)3 factor(season)4 factor(weather_main)2 factor(weather_main)3 factor(weather_main)5 factor(weather_main)6 factor(weather_main)7
1 ( 1 ) " " " " " " " " " " " " " "
2 ( 1 ) " " " " " " " " " " " " " "
3 ( 1 ) " " " " " " " " " " " " " "
4 ( 1 ) " " " " " " " " " " " " " "
5 ( 1 ) " " " " " " " " " " " " " "
6 ( 1 ) "*" " " " " " " " " " " " "
7 ( 1 ) "*" " " " " " " " " " " " "
8 ( 1 ) "*" " " " " " " " " " " " "
9 ( 1 ) "*" " " " " " " " " " " " "
10 ( 1 ) "*" "*" " " " " " " " " " "
11 ( 1 ) "*" "*" "*" " " " " " " " "
12 ( 1 ) "*" "*" "*" " " " " " " " "
factor(weather_main)8 factor(weather_main)9 factor(weather_main)12
1 ( 1 ) " " " " " "
2 ( 1 ) " " " " " "
3 ( 1 ) " " " " " "
4 ( 1 ) " " " " " "
5 ( 1 ) " " " " " "
6 ( 1 ) " " " " " "
7 ( 1 ) " " " " " "
8 ( 1 ) "*" " " " "
9 ( 1 ) "*" " " " "
10 ( 1 ) "*" " " " "
11 ( 1 ) "*" " " " "
12 ( 1 ) "*" " " " "
# Model implied by the CV-selected subset; partial F-test against the p-value model
workModelValenciaCross <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = workValencia)
anova( workModelValenciaCross, workModelValenciaPvalue)
Analysis of Variance Table
Model 1: energy_demand ~ E_1 + E_25 + temp + humidity + factor(day_night) +
factor(time_band) + factor(season) + factor(weather_main)
Model 2: energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed +
rain_duration + factor(day_night) + factor(time_band) + factor(season) +
factor(weather_main)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 27686 148890185
2 27683 148683378 3 206807 12.835 0.00000002238 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Fit summary of the CV-selected model
summary(workModelValenciaCross)
Call:
lm(formula = energy_demand ~ E_1 + E_25 + temp + humidity + factor(day_night) +
factor(time_band) + factor(season) + factor(weather_main),
data = workValencia)
Residuals:
Min 1Q Median 3Q Max
-711.96 -41.22 6.24 45.23 645.94
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -881.560408 33.180368 -26.569 < 0.0000000000000002 ***
E_1 0.885344 0.002462 359.549 < 0.0000000000000002 ***
E_25 0.118381 0.002425 48.811 < 0.0000000000000002 ***
temp 3.025870 0.111419 27.158 < 0.0000000000000002 ***
humidity -0.269465 0.029776 -9.050 < 0.0000000000000002 ***
factor(day_night)2 34.378128 1.211648 28.373 < 0.0000000000000002 ***
factor(time_band)2 0.123778 5.999046 0.021 0.9835
factor(time_band)3 42.172498 4.472314 9.430 < 0.0000000000000002 ***
factor(season)2 -26.574433 1.652752 -16.079 < 0.0000000000000002 ***
factor(season)3 -10.019958 1.341205 -7.471 0.000000000000082 ***
factor(season)4 9.681355 1.380945 7.011 0.000000000002426 ***
factor(weather_main)2 4.007721 0.951161 4.214 0.000025223409140 ***
factor(weather_main)3 24.594350 11.632030 2.114 0.0345 *
factor(weather_main)5 -4.027791 12.451190 -0.323 0.7463
factor(weather_main)6 28.171788 51.870872 0.543 0.5871
factor(weather_main)7 15.305422 6.127849 2.498 0.0125 *
factor(weather_main)8 21.905840 2.156707 10.157 < 0.0000000000000002 ***
factor(weather_main)9 84.119876 32.820047 2.563 0.0104 *
factor(weather_main)12 10.464185 7.360827 1.422 0.1552
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 73.33 on 27686 degrees of freedom
Multiple R-squared: 0.9176, Adjusted R-squared: 0.9176
F-statistic: 1.714e+04 on 18 and 27686 DF, p-value: < 0.00000000000000022
#ModelOLS <- ols_step_all_possible(workModelValenciaFull)
#cat("\n")
#plot(x = ModelOLS$n, y = ModelOLS$adjr)
#plot(x = ModelOLS$n, y = ModelOLS$aic)
#OLSsummary <- ModelOLS %>% group_by(n) %>% filter(adjr == max(adjr))
The all-possible-subsets search (ols_step_all_possible) selects the same reduced model as the p-value approach, so the code above is commented out given its long processing time.
Model adjusted R-square from above methods
Full: 0.9177
P-value: 0.9177
Bestfits: 0.9174
AIC Method: 0.9177
Cross-validation: 0.9176
Final model is the workModelValenciaPvalue with highest Adjusted R-square.
# Comparison table of the five model-selection approaches for Valencia.
# NOTE(review): fixed the typo "Corss Validation" -> "Cross Validation" to
# match the Seville table. The adjusted R-square column is named Adjusted_R
# here vs Adjusted_R2 in SevilleTable; left unchanged in case later code
# references it by name.
Method <- c('Full','P-value', 'BestFits', 'AIC Forward', 'Cross Validation')
Variables <- c(13,12,8,12,9)
Coefficients <- c(23,22,18,22,19)
Adjusted_R <- c(0.9176974,0.9176977,0.9173514,0.9176977,0.9175922)
ValenciaTable<-data.frame(Method,Variables,Coefficients,Adjusted_R)
# ---- Valencia final-model diagnostics ----
# The p-value-reduced model is carried forward as the final working model.
workFinal <- workModelValenciaPvalue
par(mfrow=c(2,2))
#Residual Plot
# Studentized residuals vs fitted values; points beyond |3| are flagged red
plot(x=fitted(workFinal), y=studres(workFinal), xlab = "Fitted Values",
ylab = "Studentized Residuals", main='Residuals Plot', col = ifelse(studres(workFinal) < -3,"red",ifelse(studres(workFinal) > 3,"red","black")))
abline(h=-3, col="red", lty=2)
abline(h=3, col="red", lty=2)
#norm test
# Anderson-Darling normality test (nortest::ad.test) on standardized residuals
std.res <- rstandard(workFinal)
ad.test(std.res)
Anderson-Darling normality test
data: std.res
A = 118.51, p-value < 0.00000000000000022
# Normal Q-Q plot of the studentized residuals; the title records the
# AD-test p-value (< 2.2e-16) from the test above.
qqnorm(studres(workFinal), pch = 1, frame = FALSE, main=expression("AD test =2.2*10"^-16))
qqline(studres(workFinal), col = "steelblue", lwd = 2)
#leverage
# Hat values of the working model; high-leverage cut-off uses the common
# 2p/n rule (p = number of coefficients, n = rows in the working set).
lev<-hatvalues(workFinal)
cutlev = (2*length(coefficients(workFinal)))/nrow(workValencia)
# Count and assess the number of leverage values > cut-off
potoutlier <- sum(lev > cutlev, na.rm=TRUE)
totcount = length(fitted(workFinal))
print(paste("The number of leverage points that are potential outliers is", potoutlier,". This is", round(100*potoutlier/totcount,2),"% of the total number of predicted values, (", totcount,") thus not material."))
[1] "The number of leverage points that are potential outliers is 936 . This is 3.38 % of the total number of predicted values, ( 27705 ) thus not material."
#barplot(lev, ylim = c(0, 2*cutlev))
# Spike plot of the working-set leverages with the 2p/n cut-off in red.
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for the working set")
abline(h = cutlev, col = "red", lty=2)
#cook distance
# Cook's distance per observation; values near/above 1 would flag influential
# points. scale_y_break (ggbreak) collapses the empty middle of the y-axis so
# both the bulk of the distances (< 0.05) and the reference line at 1 are
# visible in one panel.
cookdist<-cooks.distance(workFinal)
#barplot(cookdist, ylim=c(0,1.01), main = "Cook's Distance plot")
#abline(h = 1, col = "red")
ggplot(as.data.frame(cookdist),aes(x=1:nrow(as.data.frame(cookdist)),y=cookdist)) +
geom_line() +
geom_hline(aes(yintercept = 1,color = "red"))+
scale_y_continuous(limits = c(0,1.1))+
scale_y_break(breaks = c(0.05,0.9), scales =0.1) +
guides(color = "none") +
labs(title="Cook's Distance",
x ="Index", y = "")
# Opening the graphical device
# Customizing the output
# Report figure 1: residuals-vs-fitted plot, written to PDF for the report.
pdf("ValenciaAssumptionCheck1.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
# par(mfrow=c(2,2))
#residuals scatter plot
# Same residual plot as on screen; red points are |studentized residual| > 3.
plot(x=fitted(workFinal), y=studres(workFinal), xlab = "Fitted Values",
ylab = "Studentized Residuals", main='Residuals Plot', col = ifelse(studres(workFinal) < -3,"red",ifelse(studres(workFinal) > 3,"red","black")))
abline(h=-3, col="red", lty=2)
abline(h=3, col="red", lty=2)
grid(10,10)
dev.off()
null device
1
#normality plot
# Report figure 2: normal Q-Q plot of the studentized residuals.
pdf("ValenciaAssumptionCheck2.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
qqnorm(studres(workFinal), pch = 1, frame = FALSE, main=expression("AD test =2.2*10"^-16))
qqline(studres(workFinal), col = "steelblue", lwd = 2)
grid(10,10)
dev.off()
null device
1
#leverage plot
# Report figure 3: working-set leverage spikes with the 2p/n cut-off.
pdf("ValenciaAssumptionCheck3.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
# FIX: the y-axis label previously read "Leverage for Barcelona working set",
# but this figure (ValenciaAssumptionCheck3.pdf) plots `lev`, the leverages
# of the Valencia working model; label corrected accordingly.
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for the Valencia working set")
abline(h = cutlev, col = "red", lty=2)
grid(10,10)
dev.off()
null device
1
#Cook's distance plot
# Report figure 4: Cook's distance with a broken y-axis.
# NOTE(review): the lower break point here is 0.005, whereas the on-screen
# version above used 0.05 — confirm which break is intended.
pdf("ValenciaAssumptionCheck4.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
ggplot(as.data.frame(cookdist),aes(x=1:nrow(as.data.frame(cookdist)),y=cookdist)) +
geom_line() +
geom_hline(aes(yintercept = 1,color = "red"))+
scale_y_continuous(limits = c(0,1.1))+
scale_y_break(breaks = c(0.005,0.9), scales =0.1) +
guides(color = "none") +
labs(title="Cook's Distance",
x ="Index", y = "")
dev.off()
null device
1
# Score the final working model on the evaluation set and compare predicted
# vs actual demand; the blue y = x line marks perfect prediction.
# NOTE(review): columns 2 and 14 are dropped from evalValencia before
# predicting — presumably non-predictor columns (id/response); TODO confirm.
newdata <- evalValencia[ c(-2,-14)]
predict.eval <- predict(workFinal, newdata)
plot(x = evalValencia$energy_demand, y=predict.eval, xlab="actual energy demand for the evaluation set",
ylab="predicted energy demand for the evaluation set", main = "Valencia")
abline(a=0, b=1, col="blue")
grid(10,10)
Prepare report figure
# Opening the graphical device
# Customizing the output
# Report figure: actual vs predicted demand for the evaluation set (PDF).
pdf("ValenciaEvalActComp.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
par(mfrow=c(1,1))
plot(x = evalValencia$energy_demand, y=predict.eval, xlab="Actual energy demand for the evaluation set",
ylab="Predicted energy demand for the evaluation set", main = "Valencia", cex.lab = 0.75)
abline(a=0, b=1, col="blue")
grid(10,10)
# Closing the graphical device
dev.off()
null device
1
# Root-mean-square error of predicted vs actual demand on the evaluation set.
# FIX: the original computed rootmse twice — once against `evalValencia` and
# once against `evaldata` (a stale alias of the same 6927-row evaluation
# split), with the second silently overwriting the first. The duplicate is
# removed and `evalValencia` is used consistently.
rootmse <- rmse(evalValencia$energy_demand, predict.eval)
print(paste("The root mean square error of the predicted vs actual",nrow(evalValencia),"hourly energy_data points for Valencia over the 2015-2018 timespan is an energy demand difference of",round(rootmse,2),"MWh."))
[1] "The root mean square error of the predicted vs actual 6927 hourly energy_data points for Valencia over the 2015-2018 timespan is an energy demand difference of 73.9 MWh."
cat("\n")
# Side-by-side table of actual vs model-predicted demand (first rows shown),
# followed by a worked percent-error example for the first observation.
# FIX: the example sentence previously pulled actual values from `evaldata`,
# a stale alias of the same evaluation split; `evalValencia` is now used
# consistently with the table built just above.
demandcompdf <- data.frame("Actual (MWh)" = evalValencia$energy_demand,"Predicted (MWh)" = predict.eval)
head(demandcompdf,4L)
cat("\n")
print(paste("For example, the first actual demand value is",round(evalValencia$energy_demand[1],2),"MWh and the corresponding predicted value is",round(predict.eval[1],2),"MWh. The percent error between this predicted value relative to the actual value is",round((100*(predict.eval[1] - evalValencia$energy_demand[1])/evalValencia$energy_demand[1]),2),"%."))
[1] "For example, the first actual demand value is 1359.45 MWh and the corresponding predicted value is 1087.2 MWh. The percent error between this predicted value relative to the actual value is -20.03 %."
cat("\n")
# Mean absolute percent difference between actual and predicted demand on
# the evaluation set.
# FIX: references `evalValencia` rather than the stale `evaldata` alias used
# elsewhere in this section, for consistency.
diffpct <- c((abs(evalValencia$energy_demand - predict.eval))/evalValencia$energy_demand)
print(paste("The mean difference between the",nrow(evalValencia),"evaluation data set actual and model predicted values is",round(100*mean(diffpct),2),"%."))
[1] "The mean difference between the 6927 evaluation data set actual and model predicted values is 3.6 %."
Obtain X matrix from the model
# Extract the design matrix X of the final working model; predictor names
# are kept so they can be compared against the evaluation-set matrix later.
Predictors <-rownames(summary(workFinal)$coefficients[,])
workpredictors <- as.data.frame(Predictors)
X <- model.matrix(workFinal)
class(X)
[1] "matrix" "array"
cat("\n")
dim(X)
[1] 27705 22
cat("\n")
head(X, 4L)
(Intercept) E_1 E_25 temp humidity pressure wind_speed rain_duration factor(day_night)2 factor(time_band)2 factor(time_band)3
1 1 1359.453 1256.750 270.475 77 1001 1 0 1 0 0
2 1 1115.686 1230.321 270.292 71 1004 2 0 1 0 0
3 1 1136.149 1241.974 270.292 71 1004 2 0 1 0 0
4 1 1120.369 1283.792 270.292 71 1004 2 0 1 0 0
factor(season)2 factor(season)3 factor(season)4 factor(weather_main)2 factor(weather_main)3 factor(weather_main)5 factor(weather_main)6
1 0 0 1 0 0 0 0
2 0 0 1 0 0 0 0
3 0 0 1 0 0 0 0
4 0 0 1 0 0 0 0
factor(weather_main)7 factor(weather_main)8 factor(weather_main)9 factor(weather_main)12
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
Create x_new matrix using the validation data. Note that the purpose of running the evaluation regression here is to extract all of the correct predictor coefficients.
# Refit the same model form on the evaluation data ONLY to obtain a design
# matrix (model.matrix) with the same factor expansion as the working model;
# the coefficients of this fit are never used for prediction.
evalFinal <- lm(energy_demand ~ E_1 + E_25 + temp + humidity + pressure + wind_speed + rain_duration + factor(day_night) + factor(time_band) + factor(season) + factor(weather_main), data = evalValencia)
Predictors <-rownames(summary(evalFinal)$coefficients[,])
evalpredictors <- as.data.frame(Predictors)
x_new1 <- model.matrix(evalFinal)
cat("\n")
dim(x_new1)
[1] 6927 22
cat("\n")
#Check if there's a predictor difference between the work and eval matrices
# anti_join returns predictors present in the working model but absent from
# the evaluation model (e.g. a factor level missing from the eval split).
workevalcompare <-anti_join(workpredictors,evalpredictors)
Joining, by = "Predictors"
cat("\n")
#If workevalcompare is 'No data available in table', run the next line of code and skip to line 1960
x_new <- x_new1
#Otherwise when workevalcompare is not 'No data available in table', run the following lines of code after identifying which column is missing between X and x_new1 (no discrepancy was found between workValencia but not in evalValencia).
#x_new_ <-as.data.frame(cbind(x_new1,rep(0,nrow(x_new1))))
#dummy_xnew<- x_new_ %>% relocate(V21,.before = 16) %>% rename("factor(weather_main)4" = V21)
#x_new <- as.matrix(dummy_xnew)
cat("\n")
class(x_new)
[1] "matrix" "array"
cat("\n")
dim(x_new)
[1] 6927 22
cat("\n")
head(x_new, 4L)
(Intercept) E_1 E_25 temp humidity pressure wind_speed rain_duration factor(day_night)2 factor(time_band)2 factor(time_band)3
1 1 1109.831 1231.994 270.475 77 1001 1 0 1 0 0
2 1 1312.896 1557.109 274.601 71 1005 1 0 0 0 1
3 1 1345.570 1620.951 284.824 55 1006 1 0 0 0 1
4 1 1562.295 1619.445 279.984 68 1035 1 0 1 0 1
factor(season)2 factor(season)3 factor(season)4 factor(weather_main)2 factor(weather_main)3 factor(weather_main)5 factor(weather_main)6
1 0 0 1 0 0 0 0
2 0 0 1 0 0 0 0
3 0 0 1 0 0 0 0
4 0 0 1 0 0 0 0
factor(weather_main)7 factor(weather_main)8 factor(weather_main)9 factor(weather_main)12
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
Compute the hat values for the validation data
# Hat matrix for the evaluation data: H_new = X_new (X'X)^-1 X_new'.
# NOTE(review): only diag(h_new) is used downstream, but the full
# 6927 x 6927 matrix is materialized here (hundreds of MB);
# rowSums((x_new %*% h_new_mid) * x_new) would yield the same diagonal
# without the quadratic memory cost.
h_new_mid <- solve(t(X)%*%X)
dim(h_new_mid)
[1] 22 22
h_new <- x_new%*%h_new_mid%*%t(x_new)
dim(h_new)
[1] 6927 6927
Plot the hatvalues for the data in the evaluation set
#cutlev2 = (2*length(coefficients(evalFinal)))/nrow(evalValencia)
# Count and assess the number of leverage values > cut-off
# NOTE(review): the working-set cut-off `cutlev` (2p / n_work) is reused for
# the evaluation data instead of the commented-out evaluation-specific
# cut-off above — confirm this is intentional.
potoutlier2 <- sum(diag(h_new) > cutlev, na.rm=TRUE)
totcount2 = nrow(evalValencia)
plot(diag(h_new), type = "h", ylab = "Leverage for the validation data set", ylim = c(0,2*cutlev), main = "Valencia")
abline(h=cutlev, col="red", lty=2)
print(paste("The number of leverage points that are potential outliers is", potoutlier2,". This is", round(100*potoutlier2/totcount2,2),"% of the total number of predicted values, (", totcount2,") thus not material."))
[1] "The number of leverage points that are potential outliers is 232 . This is 3.35 % of the total number of predicted values, ( 6927 ) thus not material."
Side-by-side comparison
# Side-by-side leverage plots: working set (left) vs evaluation set (right),
# both against the same 2p/n cut-off line.
par(mfrow=c(1,2))
plot(lev, type="h", ylim = c(0, 2*cutlev), ylab="Leverage for the working data set", main = "Valencia")
abline(h = cutlev, col = "red", lty=2)
plot(diag(h_new), type = "h", ylim = c(0,2*cutlev), ylab = "Leverage for the evaluation data set", main = "Valencia")
abline(h=cutlev, col="red", lty=2)
print(paste("The leverage cutoff exceedences in the evaluation set is",round(100*potoutlier2/totcount2,2),"% of the total number of observed values (", totcount2,") and is comparable to that of the working set,", round(100*potoutlier/totcount,2),"% of the total number of predicted values (", totcount,")."))
[1] "The leverage cutoff exceedences in the evaluation set is 3.35 % of the total number of observed values ( 6927 ) and is comparable to that of the working set, 3.38 % of the total number of predicted values ( 27705 )."
Prepare report figure
# Opening the graphical device
# Customizing the output
# Report figure: side-by-side leverage comparison (working vs evaluation).
pdf("ValenciaLevComp.pdf", # File name
width = 14, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
par(mfrow=c(1,2))
# FIX: dropped the invalid `res = 600` argument — `res` is a bitmap-device
# (png/jpeg) argument, not a graphical parameter, so base plot() only emits
# a '"res" is not a graphical parameter' warning inside a pdf device.
plot(lev, type="h", ylab = "Leverage", ylim = c(0,2*cutlev), yaxt = 'n', main = "Valencia Working Data")
abline(h = cutlev, col = "red", lty=2)
axis(side = 2, at=seq(0,2*round(cutlev,5),by=round(cutlev,5)))
plot(diag(h_new), type = "h", ylab = "", ylim = c(0,2*cutlev), yaxt = 'n', main = "Valencia Evaluation Data")
abline(h=cutlev, col="red", lty=2)
axis(side = 2, at=seq(0,2*round(cutlev,5),by=round(cutlev,5)))
# Closing the graphical device
dev.off()
null device
1
# import data raw data set
# NOTE(review): path uses lowercase "data/" while earlier reads used "Data/"
# — confirm on a case-sensitive filesystem.
w_BCN_raw <- read.csv("data/working_Barcelona.csv")
e_BCN_raw <- read.csv("data/evaluation_Barcelona.csv")
# drop the first two columns (index/timestamp — TODO confirm) before modeling
w_BCN <- w_BCN_raw[,-c(1,2)]
e_BCN <- e_BCN_raw[,-c(1,2)]
# changes of data type
# FIX: convert the categorical predictors to factors. "category" is not a
# valid class in modern R — `class(x) = "category"` only tags the attribute,
# leaving the column non-factor so randomForest() would still treat it as
# numeric. as.factor() makes the variables genuinely categorical, as the
# original "changes of data type" comment intended.
w_BCN$day_night <- as.factor(w_BCN$day_night)
w_BCN$time_band <- as.factor(w_BCN$time_band)
w_BCN$season <- as.factor(w_BCN$season)
w_BCN$weather_main <- as.factor(w_BCN$weather_main)
e_BCN$day_night <- as.factor(e_BCN$day_night)
e_BCN$time_band <- as.factor(e_BCN$time_band)
e_BCN$season <- as.factor(e_BCN$season)
e_BCN$weather_main <- as.factor(e_BCN$weather_main)
# check data types
str(w_BCN)
str(e_BCN)
summary(w_BCN)
# fit random forest model
# default RF model
# Seed fixed so the fitted forest is reproducible.
set.seed(1234)
rf <- randomForest(formula = energy_demand ~ ., data = w_BCN)
# find the num of trees in the forest with smallest MSE
plot(rf)
grid(10,10)
summary(rf)
# Pick the smallest tree count whose OOB MSE is within 5 % of the
# full-forest MSE: count MSE values above that threshold (typically the
# leading ones, since OOB MSE decreases with more trees) and add 1.
testBCN <- rf$mse
testBCN <- as.data.frame(testBCN)
threshold = 0.05 # Threshold Error Margin (5 %)
#Visually, the slope become flat around 100 in the plot, divide test dataset at 100
InflectionValue <- testBCN[nrow(testBCN),] + threshold*testBCN[nrow(testBCN),]
InflectionIndex <- length(testBCN[testBCN>InflectionValue]) + 1
# modify the model with best mtry and importance of the predictors
# tuneRF searches over mtry; best.m is the mtry with the lowest OOB error.
mtry <- tuneRF(w_BCN,w_BCN$energy_demand, ntreeTry=InflectionIndex,
stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
rf_modified <-randomForest(energy_demand~.,data=w_BCN, mtry = best.m, importance=TRUE,ntree=InflectionIndex)
# FIX: plot the importance of the already-fitted tuned model instead of
# refitting a brand-new forest inside each varImpPlot() call — the refits
# doubled the runtime and, being different random forests, could display
# importances that differ from the model actually used for prediction.
varImpPlot(rf_modified, main = "Importance of the variables", n.var = 13)
pdf("VarImportance_BCN.pdf", # File name
width = 14, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
varImpPlot(rf_modified, main = "Importance of the variables", n.var = 13)
# Closing the graphical device
dev.off()
# check the time
# system.time(randomForest(energy_demand~.,data=w_BCN, mtry = best.m, importance=TRUE,ntree=InflectionIndex))
# assess the test set performance of rf_modified with Metrics library
predvalue <- predict(rf_modified, e_BCN)
rmse(e_BCN$energy_demand, predvalue)
# Report figure: actual vs predicted demand; blue y = x marks perfection.
pdf("BCN1.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
plot(e_BCN$energy_demand, predvalue, xlab = "Actual energy demand for the evaluation set ", ylab = "Predicted energy demand for the evaluation set", main = "Barcelona (Tree)", cex.lab=0.75)
abline(a=0,b=1,col="blue")
grid(10,10)
dev.off()
# Alias kept for the percent-difference calculation below.
pred.eval <- predvalue
# Mean absolute percent error of the Barcelona random-forest model.
# FIX: the original reused `evaldata` and `predict.eval` from the Valencia
# linear-model section, so it merely reprinted Valencia's 3.6 % / 6927-row
# figure. It now uses the Barcelona evaluation set and the RF predictions
# (`pred.eval`, assigned from `predvalue` just above).
diffpct <- c((abs(e_BCN$energy_demand - pred.eval))/e_BCN$energy_demand)
print(paste("The mean difference between the",nrow(e_BCN),"evaluation data set actual and model predicted values is",round(100*mean(diffpct),2),"%."))
[1] "The mean difference between the 6927 evaluation data set actual and model predicted values is 3.6 %."
# create RF model with ranger package
# FIX: the original fitted the ranger model twice — once just to print the
# summary and a second time inside system.time(). Fit once (timed), then
# print the fitted object.
system.time(rf_ranger <- ranger(formula = energy_demand ~ ., data = w_BCN))
rf_ranger
# performance
rmse(
e_BCN$energy_demand,
predict(rf_ranger, e_BCN)$predictions
)
# Report figure: actual vs ranger-predicted demand for Barcelona.
pdf("BCN2.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
plot(e_BCN$energy_demand,
predict(rf_ranger, e_BCN)$predictions, ylab = "Prediction", xlab = "Energy Demand of Test Set", main = "Ranger for Barcelona")
grid(10,10)
dev.off()
# import data raw data set
w_BLB_raw <- read.csv("data/working_Bilbao.csv")
e_BLB_raw <- read.csv("data/evaluation_Bilbao.csv")
# drop the first two columns (index/timestamp — TODO confirm) before modeling
w_BLB <- w_BLB_raw[,-c(1,2)]
e_BLB <- e_BLB_raw[,-c(1,2)]
# changes of data type
# FIX: convert categorical predictors to factors — "category" is not a valid
# class in modern R; `class(x) = "category"` leaves the column non-factor so
# randomForest() would treat it as numeric.
w_BLB$day_night <- as.factor(w_BLB$day_night)
w_BLB$time_band <- as.factor(w_BLB$time_band)
w_BLB$season <- as.factor(w_BLB$season)
w_BLB$weather_main <- as.factor(w_BLB$weather_main)
e_BLB$day_night <- as.factor(e_BLB$day_night)
e_BLB$time_band <- as.factor(e_BLB$time_band)
e_BLB$season <- as.factor(e_BLB$season)
e_BLB$weather_main <- as.factor(e_BLB$weather_main)
# check data types
str(w_BLB)
str(e_BLB)
summary(w_BLB)
# fit random forest model
# default RF model
set.seed(1234)
rf2 <- randomForest(formula = energy_demand ~ ., data = w_BLB)
# find the num of trees in the forest with smallest MSE
plot(rf2)
summary(rf2)
# n2 = tree count with minimum OOB MSE (used in the timing check below)
n2 <- which.min(rf2$mse)
# Smallest tree count whose OOB MSE is within 5 % of the full-forest MSE.
testBLB <- rf2$mse
testBLB <- as.data.frame(testBLB)
threshold = 0.05 # Threshold Error Margin (5 %)
#Visually, the slope become flat around 100 in the plot, divide test dataset at 100
InflectionValue <- testBLB[nrow(testBLB),] + threshold*testBLB[nrow(testBLB),]
InflectionIndex <- length(testBLB[testBLB>InflectionValue]) + 1
# modify the model with best mtry and importance of the predictors
mtry <- tuneRF(w_BLB,w_BLB$energy_demand, ntreeTry=InflectionIndex,
stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
rf2_modified <-randomForest(energy_demand~.,data=w_BLB, mtry = best.m, importance=TRUE,ntree=InflectionIndex)
# FIX: plot the importance of the tuned model itself instead of refitting a
# new forest inside varImpPlot(); also corrects the "varibales" typo.
varImpPlot(rf2_modified, main = "Importance of the variables", n.var = 13)
# check the time
# Timing check: refit with ntree = n2 (the minimum-OOB-MSE tree count).
system.time(randomForest(formula = energy_demand ~ .,
data = w_BLB, ntree = n2, na.action=na.exclude))
# assess the test set performance of rf_modified with Metrics library
predvalue <- predict(rf2_modified, e_BLB)
rmse(e_BLB$energy_demand, predvalue)
# Report figure: actual vs predicted demand for the Bilbao evaluation set.
pdf("BLB1.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
plot(e_BLB$energy_demand, predvalue, xlab = "Energy Demand of Test Set ", ylab = "Prediction", main = "rF for Bilbao")
grid(10,10)
dev.off()
# Re-draw the same figure on screen after closing the PDF device.
plot(e_BLB$energy_demand, predvalue, xlab = "Energy Demand of Test Set ", ylab = "Prediction", main = "rF for Bilbao")
# create RF model with ranger package
rf2_ranger <- ranger(formula = energy_demand ~ ., data = w_BLB)
# check execution time
# system.time(rf2_ranger <- ranger(formula = energy_demand ~ ., data = w_BLB))
# performance
# RMSE of the ranger model on the Bilbao evaluation set.
rmse(
e_BLB$energy_demand,
predict(rf2_ranger, e_BLB)$predictions
)
# Report figure: actual vs ranger-predicted demand (PDF + on-screen copy).
pdf("BLB2.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
plot(
e_BLB$energy_demand,
predict(rf2_ranger, e_BLB)$predictions, ylab = "Prediction", xlab = "Energy Demand of Test Set", main = "Ranger for Bilbao"
)
grid(10,10)
dev.off()
plot(
e_BLB$energy_demand,
predict(rf2_ranger, e_BLB)$predictions, ylab = "Prediction", xlab = "Energy Demand of Test Set", main = "Ranger for Bilbao"
)
# import data raw data set
w_MDD_raw <- read.csv("data/working_Madrid.csv")
e_MDD_raw <- read.csv("data/evaluation_Madrid.csv")
# drop the first two columns (index/timestamp — TODO confirm) before modeling
w_MDD <- w_MDD_raw[,-c(1,2)]
e_MDD <- e_MDD_raw[,-c(1,2)]
# changes of data type
# FIX: convert categorical predictors to factors — "category" is not a valid
# class in modern R; `class(x) = "category"` leaves the column non-factor so
# randomForest() would treat it as numeric.
w_MDD$day_night <- as.factor(w_MDD$day_night)
w_MDD$time_band <- as.factor(w_MDD$time_band)
w_MDD$season <- as.factor(w_MDD$season)
w_MDD$weather_main <- as.factor(w_MDD$weather_main)
e_MDD$day_night <- as.factor(e_MDD$day_night)
e_MDD$time_band <- as.factor(e_MDD$time_band)
e_MDD$season <- as.factor(e_MDD$season)
e_MDD$weather_main <- as.factor(e_MDD$weather_main)
# check data types
str(w_MDD)
str(e_MDD)
summary(w_MDD)
# fit random forest model
# default RF model
set.seed(1234)
rf3 <- randomForest(formula = energy_demand ~ ., data = w_MDD)
# find the num of trees in the forest with smallest MSE
plot(rf3)
summary(rf3)
# Smallest tree count whose OOB MSE is within 5 % of the full-forest MSE.
testMDD <- rf3$mse
testMDD <- as.data.frame(testMDD)
#the slope become flat around 100 in the plot, divide test dataset at 100
threshold = 0.05 # Threshold Error Margin (5 %)
#Visually, the slope become flat around 100 in the plot, divide test dataset at 100
InflectionValue <- testMDD[nrow(testMDD),] + threshold*testMDD[nrow(testMDD),]
InflectionIndex <- length(testMDD[testMDD>InflectionValue]) + 1
# modify the model with best mtry and importance of the predictors
mtry <- tuneRF(w_MDD, w_MDD$energy_demand, ntreeTry=InflectionIndex,
stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
rf3_modified <-randomForest(energy_demand~.,data=w_MDD, mtry = best.m, importance=TRUE,ntree=InflectionIndex)
# FIX: plot the importance of the tuned model itself instead of refitting a
# new forest inside varImpPlot().
varImpPlot(rf3_modified, main = "Importance of the variables", n.var = 13)
# check the time
#system.time(randomForest(formula = energy_demand ~ .,
# data = w_MDD, ntree = 100, na.action=na.exclude))
# assess the test set performance of rf_modified with Metrics library
predvalue <- predict(rf3_modified, e_MDD)
rmse(e_MDD$energy_demand, predvalue)
# Report figure: actual vs predicted demand for the Madrid evaluation set.
pdf("MDD1.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
plot(e_MDD$energy_demand, predvalue, ylab = "Prediction", xlab = "Energy Demand of Test Set", main = "rF for Madrid")
grid(10,10)
dev.off()
# Re-draw the same figure on screen after closing the PDF device.
plot(e_MDD$energy_demand, predvalue, ylab = "Prediction", xlab = "Energy Demand of Test Set", main = "rF for Madrid")
# create RF model with ranger package
rf3_ranger <- ranger(formula = energy_demand ~ ., data = w_MDD)
# check execution time
# system.time(rf3_ranger <- ranger(formula = energy_demand ~ ., data = w_MDD))
# performance
rmse(
e_MDD$energy_demand,
predict(rf3_ranger, e_MDD)$predictions
)
pdf("MDD2.pdf", # File name
width = 7, height = 4, # Width and height in inches
bg = "white", # Background color
colormodel = "cmyk") # Color model
# FIX: the PDF version previously plotted predictions from rf2_ranger (the
# Bilbao model); it now uses rf3_ranger, matching the on-screen plot below.
plot(
e_MDD$energy_demand,
predict(rf3_ranger, e_MDD)$predictions, ylab = "Prediction", xlab = "Energy Demand of Test Set", main = "Ranger for Madrid"
)
grid(10,10)
dev.off()
plot(
e_MDD$energy_demand,
predict(rf3_ranger, e_MDD)$predictions, ylab = "Prediction", xlab = "Energy Demand of Test Set", main = "Ranger for Madrid"
)
# import data raw data set
w_SVL_raw <- read.csv("data/working_Seville.csv")
e_SVL_raw <- read.csv("data/evaluation_Seville.csv")
# drop the first two columns (index/timestamp — TODO confirm) before modeling
w_SVL <- w_SVL_raw[,-c(1,2)]
e_SVL <- e_SVL_raw[,-c(1,2)]
# changes of data type
# FIX: convert categorical predictors to factors — "category" is not a valid
# class in modern R; `class(x) = "category"` leaves the column non-factor so
# randomForest() would treat it as numeric.
w_SVL$day_night <- as.factor(w_SVL$day_night)
w_SVL$time_band <- as.factor(w_SVL$time_band)
w_SVL$season <- as.factor(w_SVL$season)
w_SVL$weather_main <- as.factor(w_SVL$weather_main)
e_SVL$day_night <- as.factor(e_SVL$day_night)
e_SVL$time_band <- as.factor(e_SVL$time_band)
e_SVL$season <- as.factor(e_SVL$season)
e_SVL$weather_main <- as.factor(e_SVL$weather_main)
# check data types
str(w_SVL)
str(e_SVL)
summary(w_SVL)
# fit random forest model
# default RF model
set.seed(1234)
rf4 <- randomForest(formula = energy_demand ~ ., data = w_SVL)
# find the num of trees in the forest with smallest MSE
plot(rf4)
# Smallest tree count whose OOB MSE is within 5 % of the full-forest MSE.
testSVL <- rf4$mse
testSVL <- as.data.frame(testSVL)
#the slope become flat around 100 in the plot, divide test dataset at 100
threshold = 0.05 # Threshold Error Margin (5 %)
#Visually, the slope become flat around 100 in the plot, divide test dataset at 100
InflectionValue <- testSVL[nrow(testSVL),] + threshold*testSVL[nrow(testSVL),]
InflectionIndex <- length(testSVL[testSVL>InflectionValue]) + 1
# modify the model with best mtry and importance of the predictors
mtry <- tuneRF(w_SVL, w_SVL$energy_demand, ntreeTry=InflectionIndex,
stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
rf4_modified <-randomForest(energy_demand~.,data=w_SVL, mtry = best.m, importance=TRUE,ntree=InflectionIndex)
# FIX: plot the importance of the tuned model itself instead of refitting a
# new forest inside varImpPlot().
varImpPlot(rf4_modified, main = "Importance of the variables", n.var = 13)
# check the time
# NOTE(review): `n4` in the commented-out timing call below is never defined
# in this script (other sections use which.min(rf$mse)).
# system.time(randomForest(formula = energy_demand ~ .,
# data = w_SVL, ntree = n4, na.action=na.exclude))
# assess the test set performance of rf_modified with Metrics library
predvalue <- predict(rf4_modified, e_SVL)
rmse(e_SVL$energy_demand, predvalue)
# Actual vs predicted demand for the Seville evaluation set.
plot(e_SVL$energy_demand, predvalue, main = "Seville1")
# create RF model with ranger package
rf4_ranger <- ranger(formula = energy_demand ~ ., data = w_SVL)
# check execution time
# system.time(rf4_ranger <- ranger(formula = energy_demand ~ ., data = w_SVL))
# performance
# RMSE of the ranger model on the Seville evaluation set.
rmse(
e_SVL$energy_demand,
predict(rf4_ranger, e_SVL)$predictions
)
# Actual vs ranger-predicted demand for Seville.
plot(
e_SVL$energy_demand,
predict(rf4_ranger, e_SVL)$predictions, xlab = "predvalue", main = "Seville2"
)
# import data raw data set
w_VLC_raw <- read.csv("data/working_Valencia.csv")
e_VLC_raw <- read.csv("data/evaluation_Valencia.csv")
# drop the first two columns (index/timestamp — TODO confirm) before modeling
w_VLC <- w_VLC_raw[,-c(1,2)]
e_VLC <- e_VLC_raw[,-c(1,2)]
# changes of data type
# FIX: convert categorical predictors to factors — "category" is not a valid
# class in modern R; `class(x) = "category"` leaves the column non-factor so
# randomForest() would treat it as numeric.
w_VLC$day_night <- as.factor(w_VLC$day_night)
w_VLC$time_band <- as.factor(w_VLC$time_band)
w_VLC$season <- as.factor(w_VLC$season)
w_VLC$weather_main <- as.factor(w_VLC$weather_main)
e_VLC$day_night <- as.factor(e_VLC$day_night)
e_VLC$time_band <- as.factor(e_VLC$time_band)
e_VLC$season <- as.factor(e_VLC$season)
e_VLC$weather_main <- as.factor(e_VLC$weather_main)
# check data types
str(w_VLC)
str(e_VLC)
summary(w_VLC)
# fit random forest model
# default RF model
set.seed(1234)
rf5 <- randomForest(formula = energy_demand ~ ., data = w_VLC)
# find the num of trees in the forest with smallest MSE
plot(rf5)
# Smallest tree count whose OOB MSE is within 5 % of the full-forest MSE.
testVLC <- rf5$mse
testVLC <- as.data.frame(testVLC)
#the slope become flat around 100 in the plot, divide test dataset at 100
threshold = 0.05 # Threshold Error Margin (5 %)
#Visually, the slope become flat around 100 in the plot, divide test dataset at 100
InflectionValue <- testVLC[nrow(testVLC),] + threshold*testVLC[nrow(testVLC),]
InflectionIndex <- length(testVLC[testVLC>InflectionValue]) + 1
# modify the model with best mtry and importance of the predictors
mtry <- tuneRF(w_VLC, w_VLC$energy_demand, ntreeTry=InflectionIndex,
stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
rf5_modified <-randomForest(energy_demand~.,data=w_VLC, mtry = best.m, importance=TRUE,ntree=InflectionIndex)
# FIX: plot the importance of the tuned model itself instead of refitting a
# new forest inside varImpPlot().
varImpPlot(rf5_modified, main = "Importance of the variables", n.var = 13)
# check the time
# NOTE(review): `n5` in the commented-out timing call below is never defined
# in this script (other sections use which.min(rf$mse)).
# system.time(randomForest(formula = energy_demand ~ .,
# data = w_VLC, ntree = n5, na.action=na.exclude))
# assess the test set performance of rf_modified with Metrics library
predvalue <- predict(rf5_modified, e_VLC)
rmse(e_VLC$energy_demand, predvalue)
# Actual vs predicted demand for the Valencia evaluation set (tree model).
plot(e_VLC$energy_demand, predvalue, main = "Valencia1")
# create RF model with ranger package
rf5_ranger <- ranger(formula = energy_demand ~ ., data = w_VLC)
# check execution time
# system.time(rf5_ranger <- ranger(formula = energy_demand ~ ., data = w_VLC))
# performance
# RMSE of the ranger model on the Valencia evaluation set.
rmse(
e_VLC$energy_demand,
predict(rf5_ranger, e_VLC)$predictions
)
# Actual vs ranger-predicted demand for Valencia.
plot(
e_VLC$energy_demand,
predict(rf5_ranger, e_VLC)$predictions, xlab = "predvalue", main = "Valencia2"
)